@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
@@ -0,0 +1,39 @@
1
+ // SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
2
+ #pragma once
3
+
4
+ #define GGML_COMMON_DECL_C
5
+ #include "ggml-common.h"
6
+
7
+ #include "ggml.h"
8
+
9
+ // GGML internal header
10
+
11
+ #ifdef __cplusplus
12
+ extern "C" {
13
+ #endif
14
+
15
+ // Quantization
16
+ void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
17
+ void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
18
+
19
+ void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
20
+
21
+ // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
22
+ size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
23
+ size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
24
+ size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
25
+
26
+ // GEMV
27
+ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
28
+ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
29
+ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
30
+
31
+ // GEMM
32
+ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
33
+ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
34
+ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
35
+
36
+ #ifdef __cplusplus
37
+ }
38
+ #endif
39
+
@@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
91
91
  if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
92
92
  fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
93
93
  __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
94
- GGML_ASSERT(!"not enough space in the buffer");
95
- return;
94
+ GGML_ABORT("not enough space in the buffer");
96
95
  }
97
96
 
98
97
  void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
133
132
  return;
134
133
  }
135
134
  }
136
- GGML_ASSERT(!"out of allocated_tensors");
135
+ GGML_ABORT("out of allocated_tensors");
137
136
  }
138
137
  static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
139
138
  for (int i = 0; i < 1024; i++) {
@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
142
141
  return;
143
142
  }
144
143
  }
145
- fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
146
- GGML_ASSERT(!"tensor not found");
144
+ GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
147
145
  }
148
146
  #endif
149
147
 
@@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
176
174
  // this should never happen
177
175
  fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
178
176
  __func__, size, max_avail);
179
- GGML_ASSERT(!"not enough space in the buffer");
180
- GGML_UNREACHABLE();
177
+ GGML_ABORT("not enough space in the buffer");
181
178
  }
182
179
  }
183
180
 
@@ -339,6 +336,7 @@ struct hash_node {
339
336
  };
340
337
 
341
338
  struct tensor_alloc {
339
+ int buffer_id;
342
340
  size_t offset;
343
341
  size_t size_max; // 0 = pre-allocated, unused, or view
344
342
  };
@@ -349,7 +347,6 @@ struct leaf_alloc {
349
347
  };
350
348
 
351
349
  struct node_alloc {
352
- int buffer_id;
353
350
  struct tensor_alloc dst;
354
351
  struct tensor_alloc src[GGML_MAX_SRC];
355
352
  };
@@ -377,7 +374,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
377
374
  galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
378
375
  GGML_ASSERT(galloc->bufts != NULL);
379
376
 
380
- galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
377
+ galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
381
378
  GGML_ASSERT(galloc->buffers != NULL);
382
379
 
383
380
  galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -386,8 +383,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
386
383
  for (int i = 0; i < n_bufs; i++) {
387
384
  galloc->bufts[i] = bufts[i];
388
385
  galloc->buffers[i] = NULL;
389
- size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
390
- galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
386
+
387
+ // check if the same buffer type is used multiple times and reuse the same allocator
388
+ for (int j = 0; j < i; j++) {
389
+ if (bufts[i] == bufts[j]) {
390
+ galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
391
+ break;
392
+ }
393
+ }
394
+
395
+ if (galloc->buf_tallocs[i] == NULL) {
396
+ size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
397
+ galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
398
+ }
391
399
  }
392
400
  galloc->n_buffers = n_bufs;
393
401
 
@@ -405,14 +413,34 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
405
413
 
406
414
  for (int i = 0; i < galloc->n_buffers; i++) {
407
415
  if (galloc->buffers != NULL) {
408
- ggml_backend_buffer_free(galloc->buffers[i]);
416
+ // skip if already freed
417
+ bool freed = false;
418
+ for (int j = 0; j < i; j++) {
419
+ if (galloc->buffers[j] == galloc->buffers[i]) {
420
+ freed = true;
421
+ break;
422
+ }
423
+ }
424
+ if (!freed) {
425
+ ggml_backend_buffer_free(galloc->buffers[i]);
426
+ }
409
427
  }
410
428
  if (galloc->buf_tallocs != NULL) {
411
- ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
429
+ // skip if already freed
430
+ bool freed = false;
431
+ for (int j = 0; j < i; j++) {
432
+ if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
433
+ freed = true;
434
+ break;
435
+ }
436
+ }
437
+ if (!freed) {
438
+ ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
439
+ }
412
440
  }
413
441
  }
414
442
 
415
- free(galloc->hash_set.keys);
443
+ ggml_hash_set_free(&galloc->hash_set);
416
444
  free(galloc->hash_values);
417
445
  free(galloc->bufts);
418
446
  free(galloc->buffers);
@@ -425,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
425
453
  typedef struct ggml_gallocr * ggml_gallocr_t;
426
454
 
427
455
  static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
428
- size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
456
+ size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
429
457
  return &galloc->hash_values[i];
430
458
  }
431
459
 
@@ -511,17 +539,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
511
539
  }
512
540
  }
513
541
 
514
- static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
542
+ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
515
543
  // graph outputs are never freed
516
544
  if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
517
545
  AT_PRINTF("not freeing output %s\n", node->name);
518
546
  return;
519
547
  }
520
548
 
521
- struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
522
- ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
523
549
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
524
550
  size_t offset = hn->offset;
551
+ int buffer_id = hn->buffer_id;
552
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
553
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
525
554
  size_t size = ggml_backend_buft_get_alloc_size(buft, node);
526
555
  ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
527
556
  hn->allocated = false;
@@ -533,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
533
562
 
534
563
  static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
535
564
  // clear hash tables
536
- memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
537
- memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
565
+ ggml_hash_set_reset(&galloc->hash_set);
566
+ memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
538
567
 
539
568
  // allocate leafs
540
569
  // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@@ -626,11 +655,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
626
655
  AT_PRINTF("view_src %s: %d children, %d views\n",
627
656
  view_src->name, view_src_hn->n_children, view_src_hn->n_views);
628
657
  if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
629
- ggml_gallocr_free_node(galloc, view_src, buffer_id);
658
+ ggml_gallocr_free_node(galloc, view_src);
630
659
  }
631
660
  }
632
661
  else if (p_hn->allocated) {
633
- ggml_gallocr_free_node(galloc, parent, buffer_id);
662
+ ggml_gallocr_free_node(galloc, parent);
634
663
  }
635
664
  }
636
665
  AT_PRINTF("\n");
@@ -639,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
639
668
  }
640
669
 
641
670
  bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
642
- size_t hash_size = graph->visited_hash_table.size;
671
+ size_t min_hash_size = graph->n_nodes + graph->n_leafs;
672
+ // add 25% margin to avoid hash collisions
673
+ min_hash_size += min_hash_size / 4;
643
674
 
644
675
  // initialize hash table
645
- if (galloc->hash_set.size < hash_size) {
646
- free(galloc->hash_set.keys);
647
- free(galloc->hash_values);
648
- galloc->hash_set.size = hash_size;
649
- galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
650
- galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
676
+ if (galloc->hash_set.size < min_hash_size) {
677
+ ggml_hash_set_free(&galloc->hash_set);
678
+ galloc->hash_set = ggml_hash_set_new(min_hash_size);
651
679
  GGML_ASSERT(galloc->hash_set.keys != NULL);
680
+
681
+ free(galloc->hash_values);
682
+ galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
652
683
  GGML_ASSERT(galloc->hash_values != NULL);
653
- } else {
654
- // reset hash table
655
- memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
656
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
657
684
  }
658
685
 
659
686
  // reset allocators
@@ -674,22 +701,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
674
701
  for (int i = 0; i < graph->n_nodes; i++) {
675
702
  struct ggml_tensor * node = graph->nodes[i];
676
703
  struct node_alloc * node_alloc = &galloc->node_allocs[i];
677
- node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
678
704
  if (node->view_src || node->data) {
705
+ node_alloc->dst.buffer_id = -1;
679
706
  node_alloc->dst.offset = SIZE_MAX;
680
707
  node_alloc->dst.size_max = 0;
681
708
  } else {
682
709
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
683
- node_alloc->dst.offset = hn->offset;
684
- node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
710
+ node_alloc->dst.buffer_id = hn->buffer_id;
711
+ node_alloc->dst.offset = hn->offset;
712
+ node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
685
713
  }
686
714
  for (int j = 0; j < GGML_MAX_SRC; j++) {
687
715
  struct ggml_tensor * src = node->src[j];
688
716
  if (!src || src->view_src || src->data) {
717
+ node_alloc->src[j].buffer_id = -1;
689
718
  node_alloc->src[j].offset = SIZE_MAX;
690
719
  node_alloc->src[j].size_max = 0;
691
720
  } else {
692
721
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
722
+ node_alloc->src[j].buffer_id = hn->buffer_id;
693
723
  node_alloc->src[j].offset = hn->offset;
694
724
  node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
695
725
  }
@@ -706,9 +736,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
706
736
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
707
737
  galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
708
738
  if (leaf->view_src || leaf->data) {
739
+ galloc->leaf_allocs[i].leaf.buffer_id = -1;
709
740
  galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
710
741
  galloc->leaf_allocs[i].leaf.size_max = 0;
711
742
  } else {
743
+ galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
712
744
  galloc->leaf_allocs[i].leaf.offset = hn->offset;
713
745
  galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
714
746
  }
@@ -716,6 +748,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
716
748
 
717
749
  // reallocate buffers if needed
718
750
  for (int i = 0; i < galloc->n_buffers; i++) {
751
+ // if the buffer type is used multiple times, we reuse the same buffer
752
+ for (int j = 0; j < i; j++) {
753
+ if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
754
+ galloc->buffers[i] = galloc->buffers[j];
755
+ break;
756
+ }
757
+ }
758
+
719
759
  size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
720
760
  size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
721
761
 
@@ -724,12 +764,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
724
764
  #ifndef NDEBUG
725
765
  fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
726
766
  #endif
767
+
727
768
  ggml_backend_buffer_free(galloc->buffers[i]);
728
769
  galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
729
770
  if (galloc->buffers[i] == NULL) {
730
771
  fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
731
772
  return false;
732
773
  }
774
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
733
775
  }
734
776
  }
735
777
 
@@ -740,7 +782,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
740
782
  return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
741
783
  }
742
784
 
743
- static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
785
+ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
786
+ int buffer_id = tensor_alloc->buffer_id;
744
787
  assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
745
788
 
746
789
  if (tensor->view_src != NULL) {
@@ -750,7 +793,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
750
793
  // this tensor was allocated without ggml-backend
751
794
  return;
752
795
  }
753
- ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
796
+ ggml_backend_view_init(tensor);
754
797
  }
755
798
  } else {
756
799
  if (tensor->data == NULL) {
@@ -768,9 +811,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
768
811
  }
769
812
  }
770
813
 
771
- static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
772
- ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
773
- size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
814
+ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
815
+ size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
774
816
  return talloc->size_max >= node_size;
775
817
  }
776
818
 
@@ -793,7 +835,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
793
835
  struct ggml_tensor * node = graph->nodes[i];
794
836
  struct node_alloc * node_alloc = &galloc->node_allocs[i];
795
837
 
796
- if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
838
+ if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
797
839
  #ifndef NDEBUG
798
840
  fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
799
841
  #endif
@@ -805,7 +847,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
805
847
  if (src == NULL) {
806
848
  continue;
807
849
  }
808
- if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
850
+ if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
809
851
  #ifndef NDEBUG
810
852
  fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
811
853
  #endif
@@ -846,7 +888,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
846
888
  for (int i = 0; i < graph->n_leafs; i++) {
847
889
  struct ggml_tensor * leaf = graph->leafs[i];
848
890
  struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
849
- ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
891
+ ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
850
892
  }
851
893
  // nodes
852
894
  for (int i = 0; i < graph->n_nodes; i++) {
@@ -857,9 +899,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
857
899
  if (src == NULL) {
858
900
  continue;
859
901
  }
860
- ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
902
+ ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
861
903
  }
862
- ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
904
+ ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
863
905
  }
864
906
 
865
907
  return true;
@@ -871,6 +913,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
871
913
  if (galloc->buffers[buffer_id] == NULL) {
872
914
  return 0;
873
915
  }
916
+
917
+ for (int i = 0; i < buffer_id; i++) {
918
+ if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
919
+ // this buffer is the same as a previous one due to the same buffer type being used multiple times
920
+ // only return the buffer size the first time it appears to avoid double counting
921
+ return 0;
922
+ }
923
+ }
924
+
874
925
  return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
875
926
  }
876
927
 
@@ -886,7 +937,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
886
937
  fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
887
938
  #endif
888
939
  for (size_t i = 0; i < *n_buffers; i++) {
889
- ggml_backend_buffer_free(*buffers[i]);
940
+ ggml_backend_buffer_free((*buffers)[i]);
890
941
  }
891
942
  free(*buffers);
892
943
  return false;
@@ -899,12 +950,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
899
950
  if (t->view_src == NULL) {
900
951
  ggml_tallocr_alloc(&tallocr, t);
901
952
  } else if (t->buffer == NULL) {
902
- ggml_backend_view_init(buffer, t);
953
+ ggml_backend_view_init(t);
903
954
  }
904
955
  } else {
905
956
  if (t->view_src != NULL && t->buffer == NULL) {
906
957
  // view of a pre-allocated tensor
907
- ggml_backend_view_init(buffer, t);
958
+ ggml_backend_view_init(t);
908
959
  }
909
960
  }
910
961
  }
@@ -17,13 +17,15 @@ extern "C" {
17
17
 
18
18
  struct ggml_backend_buffer_type_i {
19
19
  const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
20
+ // allocate a buffer of this type
20
21
  ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
21
- size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
22
- size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
23
- size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
24
- bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
22
+ // tensor alignment
23
+ size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
24
+ // max buffer size that can be allocated
25
+ size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
26
+ // data size needed to allocate the tensor, including padding
27
+ size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
25
28
  // check if tensor data is in host memory
26
- // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
27
29
  bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
28
30
  };
29
31
 
@@ -92,27 +94,37 @@ extern "C" {
92
94
  void (*GGML_CALL synchronize)(ggml_backend_t backend);
93
95
 
94
96
  // compute graph with a plan (not used currently)
97
+ // create a new plan for a graph
95
98
  ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
96
99
  void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
100
+ // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
101
+ void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
102
+ // compute the graph with the plan
103
+ enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
97
104
 
98
- // compute graph with a plan
99
- enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
100
105
  // compute graph without a plan (async)
101
106
  enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
102
107
 
103
- // check if the backend supports an operation
108
+ // check if the backend can compute an operation
104
109
  bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
105
110
 
111
+ // check if the backend can use tensors allocated in a buffer type
112
+ bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
113
+
106
114
  // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
107
115
  // these should be expensive operations with large batch sizes that may benefit from running on this backend
108
116
  // even if the weight has to be copied from the CPU temporarily
109
117
  bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
110
118
 
111
119
  // (optional) event synchronization
120
+ // create a new event that can record events on this backend instance
112
121
  ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
113
122
  void (*GGML_CALL event_free) (ggml_backend_event_t event);
123
+ // record an event on the backend instance that created it
114
124
  void (*GGML_CALL event_record) (ggml_backend_event_t event);
125
+ // wait for an event on on a different backend instance
115
126
  void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
127
+ // block until an event is recorded
116
128
  void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
117
129
  };
118
130