@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -32,7 +32,15 @@ else()
32
32
  endif()
33
33
  endif()
34
34
 
35
+ # remove the lib prefix on win32 mingw
36
+ if (WIN32)
37
+ set(CMAKE_STATIC_LIBRARY_PREFIX "")
38
+ set(CMAKE_SHARED_LIBRARY_PREFIX "")
39
+ set(CMAKE_SHARED_MODULE_PREFIX "")
40
+ endif()
41
+
35
42
  option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
43
+ option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
36
44
 
37
45
  #
38
46
  # option list
@@ -66,10 +74,10 @@ if (NOT GGML_CUDA_GRAPHS_DEFAULT)
66
74
  endif()
67
75
 
68
76
  # general
69
- option(GGML_STATIC "ggml: static link libraries" OFF)
70
- option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
71
- option(GGML_LTO "ggml: enable link time optimization" OFF)
72
- option(GGML_CCACHE "ggml: use ccache if available" ON)
77
+ option(GGML_STATIC "ggml: static link libraries" OFF)
78
+ option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
79
+ option(GGML_LTO "ggml: enable link time optimization" OFF)
80
+ option(GGML_CCACHE "ggml: use ccache if available" ON)
73
81
 
74
82
  # debug
75
83
  option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
@@ -91,28 +99,39 @@ else()
91
99
  set(INS_ENB ON)
92
100
  endif()
93
101
 
94
- option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
95
-
96
- option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
97
- option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
98
- option(GGML_AVX512 "ggml: enable AVX512" OFF)
99
- option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
100
- option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
101
- option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
102
- option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
102
+ option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
103
+ option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
104
+ option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
105
+ option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
106
+ option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
107
+ option(GGML_AVX512 "ggml: enable AVX512F" OFF)
108
+ option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
109
+ option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
110
+ option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
103
111
  if (NOT MSVC)
104
- option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
112
+ # in MSVC F16C and FMA is implied with AVX2/AVX512
113
+ option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
114
+ option(GGML_F16C "ggml: enable F16C" ${INS_ENB})
115
+ # MSVC does not seem to support AMX
116
+ option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
117
+ option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
118
+ option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
105
119
  endif()
106
- option(GGML_LASX "ggml: enable lasx" ON)
107
- option(GGML_LSX "ggml: enable lsx" ON)
108
- option(GGML_SVE "ggml: enable SVE" OFF)
120
+ option(GGML_LASX "ggml: enable lasx" ON)
121
+ option(GGML_LSX "ggml: enable lsx" ON)
122
+ option(GGML_RVV "ggml: enable rvv" ON)
123
+
124
+ option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
125
+ set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
126
+
109
127
 
110
128
  if (WIN32)
111
- set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
129
+ set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
112
130
  endif()
113
131
 
114
132
  # ggml core
115
133
  set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
134
+ option(GGML_CPU "ggml: enable CPU backend" ON)
116
135
 
117
136
  # 3rd party libs / backends
118
137
  option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
@@ -123,14 +142,9 @@ option(GGML_LLAMAFILE "ggml: use LLAMAFILE"
123
142
 
124
143
  option(GGML_CUDA "ggml: use CUDA" OFF)
125
144
  option(GGML_MUSA "ggml: use MUSA" OFF)
126
- option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
127
145
  option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
128
146
  option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
129
- set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
130
- set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
131
147
  option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
132
- set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
133
- "ggml: iters./thread per block for Q2_K/Q6_K")
134
148
  set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
135
149
  "ggml: max. batch size for using peer access")
136
150
  option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
@@ -138,7 +152,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
138
152
  option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
139
153
  option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
140
154
 
141
- option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
155
+ option(GGML_HIP "ggml: use HIP" OFF)
142
156
  option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
143
157
  option(GGML_VULKAN "ggml: use Vulkan" OFF)
144
158
  option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
@@ -150,6 +164,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
150
164
  option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
151
165
  option(GGML_KOMPUTE "ggml: use Kompute" OFF)
152
166
  option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
167
+ option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
153
168
  option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
154
169
  option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
155
170
  option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
@@ -162,6 +177,13 @@ option(GGML_SYCL "ggml: use SYCL"
162
177
  option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
163
178
  set (GGML_SYCL_TARGET "INTEL" CACHE STRING
164
179
  "ggml: sycl target device")
180
+ set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
181
+ "ggml: sycl device architecture")
182
+
183
+ option(GGML_OPENCL "ggml: use OpenCL" OFF)
184
+ option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
185
+ option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
186
+ option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
165
187
 
166
188
  # extra artifacts
167
189
  option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
@@ -174,11 +196,7 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
174
196
  set(CMAKE_C_STANDARD 11)
175
197
  set(CMAKE_C_STANDARD_REQUIRED true)
176
198
 
177
- if (GGML_SYCL)
178
- set(CMAKE_CXX_STANDARD 17)
179
- else()
180
- set(CMAKE_CXX_STANDARD 11)
181
- endif()
199
+ set(CMAKE_CXX_STANDARD 17)
182
200
  set(CMAKE_CXX_STANDARD_REQUIRED true)
183
201
 
184
202
  set(THREADS_PREFER_PTHREAD_FLAG ON)
@@ -214,13 +232,14 @@ include(CMakePackageConfigHelpers)
214
232
  # all public headers
215
233
  set(GGML_PUBLIC_HEADERS
216
234
  include/ggml.h
235
+ include/ggml-cpu.h
217
236
  include/ggml-alloc.h
218
237
  include/ggml-backend.h
219
238
  include/ggml-blas.h
220
239
  include/ggml-cann.h
221
240
  include/ggml-cuda.h
222
- include/ggml.h
223
241
  include/ggml-kompute.h
242
+ include/ggml-opt.h
224
243
  include/ggml-metal.h
225
244
  include/ggml-rpc.h
226
245
  include/ggml-sycl.h
@@ -230,15 +249,14 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
230
249
  #if (GGML_METAL)
231
250
  # set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
232
251
  #endif()
233
- install(TARGETS ggml PUBLIC_HEADER)
234
-
235
- if (BUILD_SHARED_LIBS)
236
- install(TARGETS ggml LIBRARY)
237
- endif()
252
+ install(TARGETS ggml LIBRARY PUBLIC_HEADER)
253
+ install(TARGETS ggml-base LIBRARY)
238
254
 
255
+ # FIXME: this should be done in the backend cmake files
239
256
  if (GGML_METAL)
257
+ # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
240
258
  install(
241
- FILES src/ggml-metal.metal
259
+ FILES src/ggml-metal/ggml-metal.metal
242
260
  PERMISSIONS
243
261
  OWNER_READ
244
262
  OWNER_WRITE
@@ -3,6 +3,20 @@
3
3
  #include "ggml.h"
4
4
  #include "ggml-alloc.h"
5
5
 
6
+ #ifdef GGML_BACKEND_SHARED
7
+ # if defined(_WIN32) && !defined(__MINGW32__)
8
+ # ifdef GGML_BACKEND_BUILD
9
+ # define GGML_BACKEND_API __declspec(dllexport) extern
10
+ # else
11
+ # define GGML_BACKEND_API __declspec(dllimport) extern
12
+ # endif
13
+ # else
14
+ # define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
15
+ # endif
16
+ #else
17
+ # define GGML_BACKEND_API extern
18
+ #endif
19
+
6
20
  #ifdef __cplusplus
7
21
  extern "C" {
8
22
  #endif
@@ -72,7 +86,7 @@ extern "C" {
72
86
  GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
73
87
  GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
74
88
 
75
- // "offset" refers to the offset of the tensor data for setting/getting data
89
+ // "offset" refers to the offset in tensor->data for setting/getting data
76
90
  GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
77
91
  GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
78
92
  GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
@@ -114,11 +128,12 @@ extern "C" {
114
128
  //
115
129
 
116
130
  enum ggml_backend_dev_type {
131
+ // CPU device using system memory
117
132
  GGML_BACKEND_DEVICE_TYPE_CPU,
133
+ // GPU device using dedicated memory
118
134
  GGML_BACKEND_DEVICE_TYPE_GPU,
119
- // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
120
- GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
121
- GGML_BACKEND_DEVICE_TYPE_GPU_FULL
135
+ // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
136
+ GGML_BACKEND_DEVICE_TYPE_ACCEL
122
137
  };
123
138
 
124
139
  // functionality supported by the device
@@ -127,6 +142,8 @@ extern "C" {
127
142
  bool async;
128
143
  // pinned host buffer
129
144
  bool host_buffer;
145
+ // creating buffers from host ptr
146
+ bool buffer_from_host_ptr;
130
147
  // event synchronization
131
148
  bool events;
132
149
  };
@@ -165,9 +182,22 @@ extern "C" {
165
182
  GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
166
183
  GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
167
184
 
168
-
169
- // Functions that may be obtained using ggml_backend_reg_get_proc_address
170
- typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
185
+ // Common functions that may be obtained using ggml_backend_reg_get_proc_address
186
+
187
+ // Split buffer type for tensor parallelism
188
+ typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
189
+ // Set the number of threads for the backend
190
+ typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
191
+ // Get additional buffer types provided by the device (returns a NULL-terminated array)
192
+ typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
193
+ // Set the abort callback for the backend
194
+ typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
195
+ // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
196
+ struct ggml_backend_feature {
197
+ const char * name;
198
+ const char * value;
199
+ };
200
+ typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
171
201
 
172
202
  //
173
203
  // Backend registry
@@ -189,9 +219,17 @@ extern "C" {
189
219
  GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
190
220
  // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
191
221
  GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
192
- // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
222
+ // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
193
223
  GGML_API ggml_backend_t ggml_backend_init_best(void);
194
224
 
225
+ // Load a backend from a dynamic library and register it
226
+ GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
227
+ // Unload a backend if loaded dynamically and unregister it
228
+ GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
229
+ // Load all known backends from dynamic libraries
230
+ GGML_API void ggml_backend_load_all(void);
231
+ GGML_API void ggml_backend_load_all_from_path(const char * dir_path);
232
+
195
233
  //
196
234
  // Backend scheduler
197
235
  //
@@ -220,14 +258,20 @@ extern "C" {
220
258
  ggml_backend_sched_reserve(sched, reserve_graph);
221
259
 
222
260
  // compute
223
- graph = build_graph(sched);
224
- ggml_backend_sched_graph_compute(sched, graph);
261
+ graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
262
+ for (int i = 0; i < 10; ++i) {
263
+ ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
264
+ }
225
265
 
226
266
  // if there are graph inputs:
227
- ggml_backend_sched_reset(sched);
228
- ggml_backend_sched_alloc_graph(sched, graph);
229
- ggml_backend_tensor_set(input_tensor, ...);
230
- ggml_backend_sched_graph_compute(sched, graph);
267
+ graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
268
+ ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
269
+ ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
270
+ ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
271
+ ggml_backend_sched_graph_compute(sched, graph); // execute the graph
272
+
273
+ // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
274
+ // allocate them statically via ggml_backend_alloc_ctx_tensors
231
275
  }
232
276
  */
233
277
 
@@ -242,7 +286,7 @@ extern "C" {
242
286
  //
243
287
  typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
244
288
 
245
- // Initialize a backend scheduler
289
+ // Initialize a backend scheduler, backends with low index are given priority over backends with high index
246
290
  GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
247
291
  GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
248
292
 
@@ -267,7 +311,9 @@ extern "C" {
267
311
  GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
268
312
  GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
269
313
 
270
- // Reset all assignments and allocators - must be called before changing the node backends
314
+ // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
315
+ // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
316
+ // The correct way to use this API is to discard the deallocated tensors and create new ones.
271
317
  GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
272
318
 
273
319
  // Set a callback to be called for each resulting node during graph compute
@@ -297,27 +343,10 @@ extern "C" {
297
343
  GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
298
344
  GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
299
345
 
300
- //
301
- // CPU backend
302
- //
303
-
304
- GGML_API ggml_backend_t ggml_backend_cpu_init(void);
305
-
306
- GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
307
- GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
308
- GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
309
- GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
310
-
311
- // Create a backend buffer from an existing pointer
346
+ // CPU buffer types are always available
312
347
  GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
313
348
  GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
314
349
 
315
- GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
316
-
317
- #ifdef GGML_USE_CPU_HBM
318
- GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
319
- #endif
320
-
321
350
  #ifdef __cplusplus
322
351
  }
323
352
  #endif
@@ -9,13 +9,15 @@ extern "C" {
9
9
  #endif
10
10
 
11
11
  // backend API
12
- GGML_API ggml_backend_t ggml_backend_blas_init(void);
12
+ GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
13
13
 
14
- GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
14
+ GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
15
15
 
16
16
  // number of threads used for conversion to float
17
17
  // for openblas and blis, this will also set the number of threads used for blas operations
18
- GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
18
+ GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
19
+
20
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
19
21
 
20
22
 
21
23
  #ifdef __cplusplus
@@ -34,6 +34,8 @@ extern "C" {
34
34
  */
35
35
  #define GGML_CANN_MAX_DEVICES 16
36
36
 
37
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
38
+
37
39
  /**
38
40
  * @brief Initializes the CANN backend for a specified device.
39
41
  *
@@ -44,7 +46,7 @@ extern "C" {
44
46
  * @param device The index of the device to initialize.
45
47
  * @return A pointer to the initialized backend instance, or nullptr on failure.
46
48
  */
47
- GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
49
+ GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
48
50
 
49
51
  /**
50
52
  * @brief Checks if a given backend is a CANN backend.
@@ -55,7 +57,7 @@ GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
55
57
  * @param backend The backend instance to check.
56
58
  * @return True if the backend is a CANN backend, false otherwise.
57
59
  */
58
- GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
60
+ GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
59
61
 
60
62
  /**
61
63
  * @brief Retrieves the CANN buffer type for a specified device.
@@ -67,7 +69,7 @@ GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
67
69
  * @return A pointer to the buffer type interface for the specified device, or
68
70
  * nullptr if the device index is out of range.
69
71
  */
70
- GGML_API ggml_backend_buffer_type_t
72
+ GGML_BACKEND_API ggml_backend_buffer_type_t
71
73
  ggml_backend_cann_buffer_type(int32_t device);
72
74
 
73
75
  /**
@@ -78,14 +80,14 @@ ggml_backend_cann_buffer_type(int32_t device);
78
80
  *
79
81
  * @return The number of CANN devices available.
80
82
  */
81
- GGML_API int32_t ggml_backend_cann_get_device_count(void);
83
+ GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
82
84
 
83
85
  /**
84
86
  * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
85
87
  *
86
88
  * @return A pointer to the host buffer type interface.
87
89
  */
88
- GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
90
+ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
89
91
 
90
92
  /**
91
93
  * @brief Retrieves the description of a specific CANN device.
@@ -97,7 +99,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
97
99
  * @param description Pointer to a buffer where the description will be written.
98
100
  * @param description_size Size of the description buffer.
99
101
  */
100
- GGML_API void ggml_backend_cann_get_device_description(
102
+ GGML_BACKEND_API void ggml_backend_cann_get_device_description(
101
103
  int32_t device, char* description, size_t description_size);
102
104
 
103
105
  /**
@@ -112,7 +114,7 @@ GGML_API void ggml_backend_cann_get_device_description(
112
114
  * @param total Pointer to a variable where the total memory size will be
113
115
  * stored.
114
116
  */
115
- GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
117
+ GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
116
118
  size_t* free,
117
119
  size_t* total);
118
120
 
@@ -0,0 +1,38 @@
1
+ #pragma once
2
+
3
+ #ifndef __cplusplus
4
+ #error "This header is for C++ only"
5
+ #endif
6
+
7
+ #include "ggml.h"
8
+ #include "ggml-alloc.h"
9
+ #include "ggml-backend.h"
10
+ #include <memory>
11
+
12
+ // Smart pointers for ggml types
13
+
14
+ // ggml
15
+
16
+ struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
17
+ struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
18
+
19
+ typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
20
+ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
21
+
22
+ // ggml-alloc
23
+
24
+ struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
25
+
26
+ typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
27
+
28
+ // ggml-backend
29
+
30
+ struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } };
31
+ struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
32
+ struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } };
33
+ struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } };
34
+
35
+ typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
36
+ typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
37
+ typedef std::unique_ptr<ggml_backend_event, ggml_backend_event_deleter> ggml_backend_event_ptr;
38
+ typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;
@@ -0,0 +1,135 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+ #include "ggml-backend.h"
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ // the compute plan that needs to be prepared for ggml_graph_compute()
11
+ // since https://github.com/ggerganov/ggml/issues/287
12
+ struct ggml_cplan {
13
+ size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
14
+ uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
15
+
16
+ int n_threads;
17
+ struct ggml_threadpool * threadpool;
18
+
19
+ // abort ggml_graph_compute when true
20
+ ggml_abort_callback abort_callback;
21
+ void * abort_callback_data;
22
+ };
23
+
24
+ // numa strategies
25
+ enum ggml_numa_strategy {
26
+ GGML_NUMA_STRATEGY_DISABLED = 0,
27
+ GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
28
+ GGML_NUMA_STRATEGY_ISOLATE = 2,
29
+ GGML_NUMA_STRATEGY_NUMACTL = 3,
30
+ GGML_NUMA_STRATEGY_MIRROR = 4,
31
+ GGML_NUMA_STRATEGY_COUNT
32
+ };
33
+
34
+ GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
35
+ GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
36
+
37
+ GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
38
+ GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
39
+
40
+ GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
41
+ GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
42
+
43
+ GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
44
+ GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
45
+
46
+ GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
47
+ GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
48
+
49
+ GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
50
+ GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
51
+
52
+ GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
53
+ GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
54
+
55
+ GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
56
+ GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
57
+ GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
58
+ GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
59
+ GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
60
+
61
+ // ggml_graph_plan() has to be called before ggml_graph_compute()
62
+ // when plan.work_size > 0, caller must allocate memory for plan.work_data
63
+ GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
64
+ const struct ggml_cgraph * cgraph,
65
+ int n_threads, /* = GGML_DEFAULT_N_THREADS */
66
+ struct ggml_threadpool * threadpool /* = NULL */ );
67
+ GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
68
+
69
+ // same as ggml_graph_compute() but the work data is allocated as a part of the context
70
+ // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
71
+ GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
72
+
73
+ //
74
+ // system info
75
+ //
76
+
77
+ // x86
78
+ GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
79
+ GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
80
+ GGML_BACKEND_API int ggml_cpu_has_avx (void);
81
+ GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
82
+ GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
83
+ GGML_BACKEND_API int ggml_cpu_has_f16c (void);
84
+ GGML_BACKEND_API int ggml_cpu_has_fma (void);
85
+ GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
86
+ GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
87
+ GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
88
+ GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
89
+ GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
90
+ // ARM
91
+ GGML_BACKEND_API int ggml_cpu_has_neon (void);
92
+ GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
93
+ GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
94
+ GGML_BACKEND_API int ggml_cpu_has_dotprod (void);
95
+ GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
96
+ GGML_BACKEND_API int ggml_cpu_has_sve (void);
97
+ GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
98
+ // other
99
+ GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
100
+ GGML_BACKEND_API int ggml_cpu_has_vsx (void);
101
+ GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
102
+ GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
103
+
104
+ // Internal types and functions exposed for tests and benchmarks
105
+
106
+ typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
107
+ const void * GGML_RESTRICT y, size_t by, int nrc);
108
+
109
+ struct ggml_type_traits_cpu {
110
+ ggml_from_float_t from_float;
111
+ ggml_vec_dot_t vec_dot;
112
+ enum ggml_type vec_dot_type;
113
+ int64_t nrows; // number of rows to process simultaneously
114
+ };
115
+
116
+ GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
117
+
118
+ GGML_BACKEND_API void ggml_cpu_init(void);
119
+
120
+ //
121
+ // CPU backend
122
+ //
123
+
124
+ GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
125
+
126
+ GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
127
+ GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
128
+ GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
129
+ GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
130
+
131
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
132
+
133
+ #ifdef __cplusplus
134
+ }
135
+ #endif