@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
@@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
  return ggml_nbytes(tensor);
  }
 
- bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return buft->iface.supports_backend(buft, backend);
- }
-
  bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
  if (buft->iface.is_host) {
  return buft->iface.is_host(buft);
@@ -138,6 +134,10 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
  }
  }
 
+ enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
+ return buffer->usage;
+ }
+
  ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
  return buffer->buft;
  }
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
  bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
  ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
  if (dst_buf->iface.cpy_tensor) {
- return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
+ return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
  }
  return false;
  }
@@ -286,6 +286,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
  return backend->iface.supports_op(backend, op);
  }
 
+ bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ return backend->iface.supports_buft(backend, buft);
+ }
+
  bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
  if (backend->iface.offload_op != NULL) {
  return backend->iface.offload_op(backend, op);
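Editor's note (illustration, not part of the published diff): the hunks above remove ggml_backend_buft_supports_backend() and add ggml_backend_supports_buft(), so the support check is now asked of the backend rather than of the buffer type. A minimal sketch of how a caller might migrate, using only the two ggml calls shown in this diff; the wrapper name is hypothetical:

#include "ggml-backend.h"
#include <stdbool.h>

// can_use_buffer_type() is a hypothetical helper used only for this example.
static bool can_use_buffer_type(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    // before 0.3.0 (removed above): ggml_backend_buft_supports_backend(buft, backend)
    // from this version on: query the backend directly
    return ggml_backend_supports_buft(backend, buft);
}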
@@ -394,7 +398,7 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
 
  // backend registry
 
- #define GGML_REG_MAX_BACKENDS 16
+ #define GGML_REG_MAX_BACKENDS 64
 
  struct ggml_backend_reg {
  char name[128];
@@ -445,6 +449,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
  extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
  ggml_backend_kompute_reg_devices();
  #endif
+
+ #ifdef GGML_USE_CANN
+ extern GGML_CALL int ggml_backend_cann_reg_devices(void);
+ ggml_backend_cann_reg_devices();
+ #endif
  }
 
  GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
@@ -639,12 +648,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
  GGML_UNUSED(buft);
  }
 
- GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return ggml_backend_is_cpu(backend);
-
- GGML_UNUSED(buft);
- }
-
  GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
  return true;
 
@@ -659,7 +662,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
- /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
  /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
  },
  /* .context = */ NULL,
@@ -715,7 +717,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
- /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
  /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
  },
  /* .context = */ NULL,
@@ -836,6 +837,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
  GGML_UNUSED(backend);
  }
 
+ GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ return ggml_backend_buft_is_host(buft);
+
+ GGML_UNUSED(backend);
+ }
+
  static struct ggml_backend_i cpu_backend_i = {
  /* .get_name = */ ggml_backend_cpu_name,
  /* .free = */ ggml_backend_cpu_free,
@@ -846,9 +853,11 @@ static struct ggml_backend_i cpu_backend_i = {
  /* .synchronize = */ NULL,
  /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
  /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
+ /* .graph_plan_update = */ NULL,
  /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
  /* .graph_compute = */ ggml_backend_cpu_graph_compute,
  /* .supports_op = */ ggml_backend_cpu_supports_op,
+ /* .supports_buft = */ ggml_backend_cpu_supports_buft,
  /* .offload_op = */ NULL,
  /* .event_new = */ NULL,
  /* .event_free = */ NULL,
@@ -1046,17 +1055,19 @@ struct ggml_backend_sched {
  ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
  ggml_gallocr_t galloc;
 
- // hash keys of the nodes in the graph
- struct ggml_hash_set hash_set;
- // hash values
- int * tensor_backend_id;
- struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+ // hash map of the nodes in the graph
+ struct ggml_hash_set hash_set;
+ int * hv_tensor_backend_ids; // [hash_set.size]
+ struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
 
  int * node_backend_ids; // [graph_size]
  int * leaf_backend_ids; // [graph_size]
 
+ int * prev_node_backend_ids; // [graph_size]
+ int * prev_leaf_backend_ids; // [graph_size]
+
  // copy of the graph with modified inputs
- struct ggml_cgraph * graph;
+ struct ggml_cgraph graph;
 
  // graph splits
  struct ggml_backend_sched_split * splits;
@@ -1075,17 +1086,16 @@ struct ggml_backend_sched {
  ggml_backend_sched_eval_callback callback_eval;
  void * callback_eval_user_data;
 
- // align context_buffer to GGML_MEM_ALIGN
- #ifdef _MSC_VER
- __declspec(align(GGML_MEM_ALIGN))
- #else
- __attribute__((aligned(GGML_MEM_ALIGN)))
- #endif
- char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+ char * context_buffer;
+ size_t context_buffer_size;
+
+ bool debug;
  };
 
- #define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
- #define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
+ #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
+ #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
+ #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
+ #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
 
  // returns the priority of the backend, lower id is higher priority
  static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
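Editor's note (illustration, not part of the published diff): the tensor_id_copy macro introduced above replaces the old fixed-size tensor_copies member with one flat allocation of hash_set.size * n_backends * n_copies pointers. A minimal sketch of the same row-major index arithmetic, with placeholder names used only for this example:

#include <stddef.h>

// mirrors the tensor_id_copy macro: [hash id][backend][copy] flattened into one array
static size_t copy_index(size_t id, int backend_id, int copy_id, int n_backends, int n_copies) {
    return id * (size_t) n_backends * (size_t) n_copies
         + (size_t) backend_id * (size_t) n_copies
         + (size_t) copy_id;
}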
@@ -1097,22 +1107,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
  return -1;
  }
 
- static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
  ggml_backend_buffer_t buffer = tensor->buffer;
  if (buffer == NULL) {
  return -1;
  }
 
- // find highest prio backend that supports the buffer type
+ // find highest prio backend that supports the buffer type and the op
  for (int i = 0; i < sched->n_backends; i++) {
- if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
+ if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
+ ggml_backend_supports_op(sched->backends[i], op)) {
  return i;
  }
  }
 
- fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
- __func__, ggml_backend_buffer_name(buffer), tensor->name);
- GGML_ASSERT(false);
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+ __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
+ #endif
 
  return -1;
  }
@@ -1131,7 +1143,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
  // TODO: use supports_op to check if the backend supports the op
 
  // assign pre-allocated nodes to their backend
- int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+ int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
  if (cur_backend_id != -1) {
  SET_CAUSE(tensor, "1.dst");
  return cur_backend_id;
@@ -1139,7 +1151,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 
  // view_src
  if (tensor->view_src != NULL) {
- cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+ cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
  if (cur_backend_id != -1) {
  SET_CAUSE(tensor, "1.vsrc");
  return cur_backend_id;
@@ -1153,7 +1165,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
  return cur_backend_id;
  }
 
- // assign nodes that use weights to the backend of the weights
  // operations with weights are preferably run on the same backend as the weights
  for (int i = 0; i < GGML_MAX_SRC; i++) {
  const struct ggml_tensor * src = tensor->src[i];
@@ -1161,11 +1172,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
  continue;
  }
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
- int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+ int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
  // check if a backend with higher prio wants to offload the op
  if (src_backend_id == sched->n_backends - 1) {
  for (int b = 0; b < src_backend_id; b++) {
- if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+ if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
  SET_CAUSE(tensor, "1.off");
  return b;
  }
@@ -1223,10 +1234,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
  }
  }
 
- //#define DEBUG_PASS1
- //#define DEBUG_PASS2
- //#define DEBUG_PASS3
- //#define DEBUG_PASS4
+ static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
+ ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
+ ggml_backend_buffer_type_t buft = NULL;
+
+ if (buf) {
+ // the tensor is already allocated
+ buft = buf->buft;
+ } else {
+ // see if the tensor already has a backend assigned, and use the buffer type of that backend
+ int tensor_backend_id = tensor_backend_id(t);
+ if (tensor_backend_id == -1 && t->view_src) {
+ tensor_backend_id = tensor_backend_id(t->view_src);
+ }
+ if (tensor_backend_id != -1) {
+ buft = sched->bufts[tensor_backend_id];
+ }
+ }
+
+ return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
+ }
+
+ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+ if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
+ *node_backend_id = cur_backend_id;
+ SET_CAUSE(node, "2.sup");
+ }
+ }
 
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
  static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
@@ -1236,7 +1270,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  sched->is_reset = false;
 
  struct ggml_init_params params = {
- /* .mem_size = */ sizeof(sched->context_buffer),
+ /* .mem_size = */ sched->context_buffer_size,
  /* .mem_buffer = */ sched->context_buffer,
  /* .no_alloc = */ true
  };
@@ -1245,52 +1279,52 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
  sched->ctx = ggml_init(params);
  if (sched->ctx == NULL) {
- fprintf(stderr, "%s: failed to initialize context\n", __func__);
- GGML_ASSERT(false);
+ GGML_ABORT("%s: failed to initialize context\n", __func__);
  }
 
  // pass 1: assign backends to ops with pre-allocated inputs
  for (int i = 0; i < graph->n_leafs; i++) {
  struct ggml_tensor * leaf = graph->leafs[i];
  int * leaf_backend_id = &tensor_backend_id(leaf);
- if (*leaf_backend_id != -1) {
- // do not overwrite user assignments
- continue;
+ // do not overwrite user assignments
+ if (*leaf_backend_id == -1) {
+ *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
  }
- *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
  }
 
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
  int * node_backend_id = &tensor_backend_id(node);
- if (*node_backend_id != -1) {
- // do not overwrite user assignments
- continue;
- }
- *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
- // src
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
+ // do not overwrite user assignments
+ if (*node_backend_id == -1) {
+ *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
+
+ #if 0
+ // src
+ if (node->op == GGML_OP_NONE) {
  continue;
  }
- int * src_backend_id = &tensor_backend_id(src);
- if (*src_backend_id == -1) {
- *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
+
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ continue;
+ }
+ int * src_backend_id = &tensor_backend_id(src);
+ if (*src_backend_id == -1) {
+ *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
+ }
  }
+ #endif
  }
  }
- #ifdef DEBUG_PASS1
- fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
- #endif
 
  // pass 2: expand current backend assignments
  // assign the same backend to adjacent nodes
  // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
  // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
-
-
- // pass 2.2 expand gpu down
+ // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
+ // expand gpu down
  {
  int cur_backend_id = -1;
  for (int i = 0; i < graph->n_nodes; i++) {
@@ -1306,13 +1340,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  } else {
  cur_backend_id = *node_backend_id;
  }
- } else {
- *node_backend_id = cur_backend_id;
- SET_CAUSE(node, "2.2");
+ } else if (cur_backend_id != -1) {
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
  }
  }
  }
- // pass 2.1 expand gpu up
+ // expand gpu up
  {
  int cur_backend_id = -1;
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1328,13 +1361,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  } else {
  cur_backend_id = *node_backend_id;
  }
- } else {
- *node_backend_id = cur_backend_id;
- SET_CAUSE(node, "2.1");
+ } else if (cur_backend_id != -1) {
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
  }
  }
  }
- // pass 2.4 expand rest down
+ // expand rest down
  {
  int cur_backend_id = -1;
  for (int i = 0; i < graph->n_nodes; i++) {
@@ -1345,13 +1377,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  int * node_backend_id = &tensor_backend_id(node);
  if (*node_backend_id != -1) {
  cur_backend_id = *node_backend_id;
- } else {
- *node_backend_id = cur_backend_id;
- SET_CAUSE(node, "2.4");
+ } else if (cur_backend_id != -1) {
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
  }
  }
  }
- // pass 2.3 expand rest up
+ // expand rest up
  {
  int cur_backend_id = -1;
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1362,24 +1393,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  int * node_backend_id = &tensor_backend_id(node);
  if (*node_backend_id != -1) {
  cur_backend_id = *node_backend_id;
- } else {
- *node_backend_id = cur_backend_id;
- SET_CAUSE(node, "2.3");
+ } else if (cur_backend_id != -1) {
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
  }
  }
  }
 
- #ifdef DEBUG_PASS2
- fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
- #endif
+ // pass 3: upgrade nodes to higher prio backends with compatible buffer types
+ // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
+ // however, we also need to verify that the sources are in compatible buffer types
+ // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
+ // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
+ // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
+ // additionally, set remaining unassigned nodes to the backend with the most supported inputs
+ // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ if (ggml_is_view_op(node->op)) {
+ continue;
+ }
+ int * node_backend_id = &tensor_backend_id(node);
+ if (*node_backend_id == -1) {
+ // unassigned node: find the backend with the most supported inputs
+ int n_supported_best = -1;
+ for (int b = 0; b < sched->n_backends; b++) {
+ if (ggml_backend_supports_op(sched->backends[b], node)) {
+ int n_supported = 0;
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ continue;
+ }
+ if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
+ n_supported++;
+ }
+ }
+ if (n_supported > n_supported_best) {
+ n_supported_best = n_supported;
+ *node_backend_id = b;
+ SET_CAUSE(node, "3.best");
+ }
+ }
+ }
+ } else {
+ // assigned node: upgrade to higher prio backend if possible
+ for (int b = 0; b < *node_backend_id; b++) {
+ if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
+ bool supported = true;
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ continue;
+ }
+ if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
+ supported = false;
+ break;
+ }
+ }
+ if (supported) {
+ *node_backend_id = b;
+ SET_CAUSE(node, "3.upg");
+ break;
+ }
+ }
+ }
+ }
+ }
 
- // pass 3: assign backends to remaining src from dst and view_src
+ // pass 4: assign backends to remaining src from dst and view_src
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
  int * cur_backend_id = &tensor_backend_id(node);
  if (node->view_src != NULL && *cur_backend_id == -1) {
  *cur_backend_id = tensor_backend_id(node->view_src);
- SET_CAUSE(node, "3.vsrc");
+ SET_CAUSE(node, "4.vsrc");
  }
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * src = node->src[j];
@@ -1391,24 +1478,22 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  if (src->view_src != NULL) {
  // views are always on the same backend as the source
  *src_backend_id = tensor_backend_id(src->view_src);
- SET_CAUSE(src, "3.vsrc");
+ SET_CAUSE(src, "4.vsrc");
  } else {
  *src_backend_id = *cur_backend_id;
- SET_CAUSE(src, "3.cur");
+ SET_CAUSE(src, "4.cur");
  }
  }
  }
  }
- #ifdef DEBUG_PASS3
- fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
- #endif
 
- // pass 4: split graph, find tensors that need to be copied
+ // pass 5: split graph, find tensors that need to be copied
  {
  int i_split = 0;
  struct ggml_backend_sched_split * split = &sched->splits[0];
  // find the backend of the first split, skipping view ops
- for (int i = 0; i < graph->n_nodes; i++) {
+ int i = 0;
+ for (; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
  if (!ggml_is_view_op(node->op)) {
  split->backend_id = tensor_backend_id(node);
@@ -1417,9 +1502,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  }
  }
  split->i_start = 0;
  split->n_inputs = 0;
- memset(split->inputs, 0, sizeof(split->inputs)); //HACK
  int cur_backend_id = split->backend_id;
- for (int i = 0; i < graph->n_nodes; i++) {
+ for (; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
 
@@ -1428,7 +1512,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
  const int node_backend_id = tensor_backend_id(node);
 
- GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
+ assert(node_backend_id != -1); // all nodes should be assigned by now
 
  // check if we should start a new split based on the sources of the current node
  bool need_new_split = false;
@@ -1442,16 +1526,18 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  // by starting a new split, the memory of the previously offloaded weights can be reused
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
  int src_backend_id = tensor_backend_id(src);
- if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
+ if (src_backend_id != cur_backend_id) {
  need_new_split = true;
  break;
  }
  }
  // check if the split has too many inputs
+ // FIXME: count the number of inputs instead of only checking when full
  if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
  const size_t id = hash_id(src);
- int src_backend_id = sched->tensor_backend_id[id];
- if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+ int src_backend_id = sched->hv_tensor_backend_ids[id];
+ bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
+ if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
  //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
  need_new_split = true;
  break;
@@ -1483,12 +1569,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  continue;
  }
 
- const int src_backend_id = tensor_backend_id(src);
+ size_t src_id = hash_id(src);
+ const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
  assert(src_backend_id != -1); // all inputs should be assigned by now
 
- if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
- size_t id = hash_id(src);
- if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
+ if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
+ if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
  ggml_backend_t backend = sched->backends[src_backend_id];
  for (int c = 0; c < sched->n_copies; c++) {
  struct ggml_tensor * tensor_copy;
@@ -1502,7 +1588,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  ggml_set_input(tensor_copy);
  ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
  }
- sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
+ tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
  SET_CAUSE(tensor_copy, "4.cpy");
  }
  int n_graph_inputs = sched->n_graph_inputs++;
@@ -1511,10 +1597,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  }
  }
 
- if (src_backend_id != node_backend_id) {
+ if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
  // create a copy of the input in the split's backend
- const size_t id = hash_id(src);
- if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
+ if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
  ggml_backend_t backend = sched->backends[cur_backend_id];
  for (int c = 0; c < sched->n_copies; c++) {
  struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
@@ -1523,27 +1608,49 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  ggml_set_input(tensor_copy);
  ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
  }
- sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
+ tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
  SET_CAUSE(tensor_copy, "4.cpy");
  }
  int n_inputs = split->n_inputs++;
  GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
  split->inputs[n_inputs] = src;
  }
- node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
+ node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
  }
  }
  }
  split->i_end = graph->n_nodes;
  sched->n_splits = i_split + 1;
  }
- #ifdef DEBUG_PASS4
- fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
- #endif
 
- // create copies of the graph for each split
- // TODO: avoid this copy
- struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
+ if (sched->debug) {
+ ggml_backend_sched_print_assignments(sched, graph);
+ }
+
+ // swap node_backend_ids and leaf _backend_ids with prevs
+ {
+ int * tmp = sched->node_backend_ids;
+ sched->node_backend_ids = sched->prev_node_backend_ids;
+ sched->prev_node_backend_ids = tmp;
+
+ tmp = sched->leaf_backend_ids;
+ sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
+ sched->prev_leaf_backend_ids = tmp;
+ }
+
+ int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+ if (sched->graph.size < graph_size) {
+ sched->graph.size = graph_size;
+ sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
+ sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
+ GGML_ASSERT(sched->graph.nodes != NULL);
+ GGML_ASSERT(sched->graph.leafs != NULL);
+ }
+ sched->graph.n_nodes = 0;
+ sched->graph.n_leafs = 0;
+
+ struct ggml_cgraph * graph_copy = &sched->graph;
+
  for (int i = 0; i < sched->n_splits; i++) {
  struct ggml_backend_sched_split * split = &sched->splits[i];
  split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
@@ -1554,12 +1661,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
  struct ggml_tensor * input = split->inputs[j];
  const size_t input_id = hash_id(input);
- struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
+ struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
 
  // add a dependency to the input source so that it is not freed before the copy is done
  struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
  input_dep->src[0] = input;
- sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
+ sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
  graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
 
  // add a dependency to the input copy so that it is allocated at the start of the split
@@ -1581,7 +1688,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  size_t id = hash_id(input);
  int backend_id = tensor_backend_id(input);
  for (int c = 0; c < sched->n_copies; c++) {
- struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+ struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
  sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
  graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
  }
@@ -1594,7 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  struct ggml_tensor * input = split->inputs[j];
  size_t id = hash_id(input);
  for (int c = 0; c < sched->n_copies; c++) {
- struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+ struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
  sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
  graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
  }
@@ -1608,20 +1715,36 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
  sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
  graph_copy->leafs[graph_copy->n_leafs++] = leaf;
  }
-
- sched->graph = graph_copy;
  }
 
  static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+ bool backend_ids_changed = false;
+ for (int i = 0; i < sched->graph.n_nodes; i++) {
+ if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
+ sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
+ backend_ids_changed = true;
+ break;
+ }
+ }
+ if (!backend_ids_changed) {
+ for (int i = 0; i < sched->graph.n_leafs; i++) {
+ if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
+ sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
+ backend_ids_changed = true;
+ break;
+ }
+ }
+ }
+
  // allocate graph
- if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+ if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
  // the re-allocation may cause the split inputs to be moved to a different address
  ggml_backend_sched_synchronize(sched);
  #ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
+ fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
  #endif
- ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
- if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+ ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
+ if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
  fprintf(stderr, "%s: failed to allocate graph\n", __func__);
  return false;
  }
@@ -1642,7 +1765,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
  for (int j = 0; j < split->n_inputs; j++) {
  ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
  struct ggml_tensor * input = split->inputs[j];
- struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
+ struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
 
  if (input->flags & GGML_TENSOR_FLAG_INPUT) {
  // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
@@ -1727,18 +1850,24 @@ ggml_backend_sched_t ggml_backend_sched_new(
1727
1850
 
1728
1851
  struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
1729
1852
 
1853
+ sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
1854
+ sched->n_backends = n_backends;
1855
+ sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
1856
+
1730
1857
  // initialize hash table
1731
- sched->hash_set = ggml_hash_set_new(graph_size);
1732
- sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
1733
- sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
1858
+ // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
1859
+ sched->hash_set = ggml_hash_set_new(graph_size);
1860
+ sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1861
+ sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
1734
1862
 
1735
1863
  const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
1736
- sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1737
- sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1738
-
1739
- sched->n_backends = n_backends;
1864
+ sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1865
+ sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1866
+ sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1867
+ sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1740
1868
 
1741
- sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
1869
+ sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
1870
+ sched->context_buffer = malloc(sched->context_buffer_size);
1742
1871
 
1743
1872
  const int initial_splits_capacity = 16;
1744
1873
  sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
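
Note: the hv_tensor_copies table allocated above is a flat array sized hash_set.size * n_backends * n_copies. The tensor_id_copy helper used in the earlier hunks is defined elsewhere in ggml-backend.c and is not part of this diff, so the indexing below is only a plausible sketch consistent with that allocation size:

    // sketch only, assuming (id, backend, copy) are flattened in row-major order;
    // the real tensor_id_copy macro lives elsewhere in ggml-backend.c
    struct ggml_tensor ** slot =
        &sched->hv_tensor_copies[(id * sched->n_backends + backend_id) * sched->n_copies + copy_id];
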
@@ -1747,7 +1876,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
  for (int b = 0; b < n_backends; b++) {
  sched->backends[b] = backends[b];
  sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
- GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+ GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
  if (sched->n_copies > 1) {
  for (int c = 0; c < sched->n_copies; c++) {
  sched->events[b][c] = ggml_backend_event_new(backends[b]);
@@ -1773,35 +1902,37 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
  }
  ggml_gallocr_free(sched->galloc);
  ggml_free(sched->ctx);
+ ggml_hash_set_free(&sched->hash_set);
  free(sched->splits);
- free(sched->hash_set.keys);
- free(sched->tensor_backend_id);
- free(sched->tensor_copies);
+ free(sched->hv_tensor_backend_ids);
+ free(sched->hv_tensor_copies);
  free(sched->node_backend_ids);
  free(sched->leaf_backend_ids);
+ free(sched->prev_node_backend_ids);
+ free(sched->prev_leaf_backend_ids);
+ free(sched->context_buffer);
+ free(sched->graph.nodes);
+ free(sched->graph.leafs);
  free(sched);
  }

  void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
  // reset state for the next run
  if (!sched->is_reset) {
- size_t hash_size = sched->hash_set.size;
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
- memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
- memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
-
+ ggml_hash_set_reset(&sched->hash_set);
+ memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+ memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
  sched->is_reset = true;
  }
  sched->is_alloc = false;
  }

  bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
- GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+ GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

  ggml_backend_sched_split_graph(sched, measure_graph);

- // TODO: extract this to a separate function
- if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
+ if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
  return false;
  }

@@ -1812,10 +1943,11 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
  }

  bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
- GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);

  ggml_backend_sched_split_graph(sched, graph);

+
  if (!ggml_backend_sched_alloc_splits(sched)) {
  return false;
  }
@@ -1864,6 +1996,15 @@ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
  return sched->n_copies;
  }

+ int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
+ return sched->n_backends;
+ }
+
+ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
+ GGML_ASSERT(i >= 0 && i < sched->n_backends);
+ return sched->backends[i];
+ }
+
  size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
  int backend_index = ggml_backend_sched_backend_id(sched, backend);
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
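
A minimal usage sketch for the two accessors added above; the scheduler handle and the surrounding setup are assumed, not shown in this diff:

    // enumerate the backends attached to an existing ggml_backend_sched_t
    int n = ggml_backend_sched_get_n_backends(sched);
    for (int i = 0; i < n; i++) {
        ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
        fprintf(stderr, "backend %d: %s\n", i, ggml_backend_name(backend));
    }
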
@@ -1875,6 +2016,8 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
  int backend_index = ggml_backend_sched_backend_id(sched, backend);
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
  tensor_backend_id(node) = backend_index;
+ SET_CAUSE(node, "usr");
+ sched->is_reset = false;
  }

  ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
@@ -1887,15 +2030,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,

  // utils

- void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+ void ggml_backend_view_init(struct ggml_tensor * tensor) {
  GGML_ASSERT(tensor->buffer == NULL);
  GGML_ASSERT(tensor->view_src != NULL);
  GGML_ASSERT(tensor->view_src->buffer != NULL);
  GGML_ASSERT(tensor->view_src->data != NULL);

- tensor->buffer = buffer;
+ tensor->buffer = tensor->view_src->buffer;
  tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
- ggml_backend_buffer_init_tensor(buffer, tensor);
+ ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
  }

  void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
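
With the single-argument signature above, the buffer now comes from the view source, so a call site migrates roughly as in this sketch (based on the graph_copy_init_tensor change later in this diff); the tensor variable name is illustrative:

    // before: ggml_backend_view_init(t->view_src->buffer, t);
    // after:
    ggml_backend_view_init(t);
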
@@ -1917,9 +2060,9 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
  GGML_ASSERT(src != NULL);
  GGML_ASSERT(src->data && "graph must be allocated");

- size_t id = ggml_hash_insert(hash_set, src);
- if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
- return node_copies[ggml_hash_find(hash_set, src)];
+ size_t id = ggml_hash_insert(&hash_set, src);
+ if (id == GGML_HASHSET_ALREADY_EXISTS) {
+ return node_copies[ggml_hash_find(&hash_set, src)];
  }

  struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
@@ -1944,7 +2087,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
  return dst;
  }

- static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
+ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
  size_t id = ggml_hash_find(hash_set, src);
  if (node_init[id]) {
  return;
@@ -1954,7 +2097,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
  struct ggml_tensor * dst = node_copies[id];
  if (dst->view_src != NULL) {
  graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
- ggml_backend_view_init(dst->view_src->buffer, dst);
+ ggml_backend_view_init(dst);
  }
  else {
  ggml_backend_tensor_copy(src, dst);
@@ -1971,10 +2114,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
  }

  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
- struct ggml_hash_set hash_set = {
- /* .size = */ graph->visited_hash_table.size,
- /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
- };
+ struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
  struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
  bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));

@@ -1989,7 +2129,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s

  if (ctx_allocated == NULL || ctx_unallocated == NULL) {
  fprintf(stderr, "failed to allocate context for graph copy\n");
- free(hash_set.keys);
+ ggml_hash_set_free(&hash_set);
  free(node_copies);
  free(node_init);
  ggml_free(ctx_allocated);
@@ -2012,7 +2152,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
  ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
  if (buffer == NULL) {
  fprintf(stderr, "failed to allocate buffer for graph copy\n");
- free(hash_set.keys);
+ ggml_hash_set_free(&hash_set);
  free(node_copies);
  free(node_init);
  ggml_free(ctx_allocated);
@@ -2030,19 +2170,19 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
  // copy data and init views
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
- graph_copy_init_tensor(hash_set, node_copies, node_init, node);
+ graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
  }

  // build graph copy
  struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];
- struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
+ struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
  graph_copy->nodes[i] = node_copy;
  }
  graph_copy->n_nodes = graph->n_nodes;

- free(hash_set.keys);
+ ggml_hash_set_free(&hash_set);
  free(node_copies);
  free(node_init);