@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -14,7 +14,7 @@
14
14
 
15
15
  //#define GGML_ALLOCATOR_DEBUG
16
16
 
17
- //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
17
+ //#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
18
18
  #define AT_PRINTF(...)
19
19
 
20
20
 
@@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
89
89
  size = GGML_PAD(size, talloc->alignment);
90
90
 
91
91
  if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
92
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
92
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
93
93
  __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
94
94
  GGML_ABORT("not enough space in the buffer");
95
95
  }
@@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
172
172
  best_fit_block = alloc->n_free_blocks - 1;
173
173
  } else {
174
174
  // this should never happen
175
- fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
175
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
176
176
  __func__, size, max_avail);
177
177
  GGML_ABORT("not enough space in the buffer");
178
178
  }
@@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
209
209
  }
210
210
  }
211
211
  }
212
- fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
212
+ GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
213
213
  for (int i = 0; i < 1024; i++) {
214
214
  if (alloc->allocated_tensors[i].tensor) {
215
- fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
215
+ GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
216
216
  alloc->allocated_tensors[i].offset,
217
217
  alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
218
218
  ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
219
219
  }
220
220
  }
221
- fprintf(stderr, "\n");
221
+ GGML_LOG_DEBUG("\n");
222
222
  }
223
223
  #endif
224
224
 
@@ -348,7 +348,6 @@ struct tensor_alloc {
348
348
  };
349
349
 
350
350
  struct leaf_alloc {
351
- int buffer_id;
352
351
  struct tensor_alloc leaf;
353
352
  };
354
353
 
@@ -467,18 +466,12 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
467
466
  return ggml_gallocr_hash_get(galloc, t)->allocated;
468
467
  }
469
468
 
470
- static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
471
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
472
- hn->buffer_id = buffer_id;
473
- hn->offset = offset;
474
- hn->allocated = true;
475
- }
476
-
477
469
  static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
478
470
  return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
479
471
  }
480
472
 
481
473
  static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
474
+ GGML_ASSERT(buffer_id >= 0);
482
475
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
483
476
 
484
477
  if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
@@ -541,7 +534,6 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
541
534
  size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
542
535
  hn->buffer_id = buffer_id;
543
536
  hn->offset = offset;
544
- return;
545
537
  }
546
538
  }
547
539
 
@@ -740,7 +732,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
740
732
  for (int i = 0; i < graph->n_leafs; i++) {
741
733
  struct ggml_tensor * leaf = graph->leafs[i];
742
734
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
743
- galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
744
735
  if (leaf->view_src || leaf->data) {
745
736
  galloc->leaf_allocs[i].leaf.buffer_id = -1;
746
737
  galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
@@ -768,13 +759,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
768
759
  // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
769
760
  if (new_size > cur_size || galloc->buffers[i] == NULL) {
770
761
  #ifndef NDEBUG
771
- fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
762
+ GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
772
763
  #endif
773
764
 
774
765
  ggml_backend_buffer_free(galloc->buffers[i]);
775
766
  galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
776
767
  if (galloc->buffers[i] == NULL) {
777
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
768
+ GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
778
769
  return false;
779
770
  }
780
771
  ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -818,21 +809,25 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
818
809
  }
819
810
 
820
811
  static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
821
- size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
812
+ size_t node_size = 0;
813
+ if (!node->data && !node->view_src) {
814
+ GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
815
+ node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
816
+ }
822
817
  return talloc->size_max >= node_size;
823
818
  }
824
819
 
825
820
  static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
826
821
  if (galloc->n_nodes != graph->n_nodes) {
827
822
  #ifndef NDEBUG
828
- fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
823
+ GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
829
824
  #endif
830
825
  return true;
831
826
  }
832
827
 
833
828
  if (galloc->n_leafs != graph->n_leafs) {
834
829
  #ifndef NDEBUG
835
- fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
830
+ GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
836
831
  #endif
837
832
  return true;
838
833
  }
@@ -843,7 +838,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
843
838
 
844
839
  if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
845
840
  #ifndef NDEBUG
846
- fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
841
+ GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
847
842
  #endif
848
843
  return true;
849
844
  }
@@ -855,7 +850,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
855
850
  }
856
851
  if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
857
852
  #ifndef NDEBUG
858
- fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
853
+ GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
859
854
  #endif
860
855
  return true;
861
856
  }
@@ -869,14 +864,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
869
864
  if (ggml_gallocr_needs_realloc(galloc, graph)) {
870
865
  if (galloc->n_buffers == 1) {
871
866
  #ifndef NDEBUG
872
- fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
867
+ GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
873
868
  #endif
874
869
  if (!ggml_gallocr_reserve(galloc, graph)) {
875
870
  return false;
876
871
  }
877
872
  } else {
878
873
  #ifndef NDEBUG
879
- fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
874
+ GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
880
875
  #endif
881
876
  return false;
882
877
  }
@@ -940,7 +935,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
940
935
  ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
941
936
  if (buffer == NULL) {
942
937
  #ifndef NDEBUG
943
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
938
+ GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
944
939
  #endif
945
940
  for (size_t i = 0; i < *n_buffers; i++) {
946
941
  ggml_backend_buffer_free((*buffers)[i]);
@@ -990,7 +985,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
990
985
  }
991
986
 
992
987
  if (this_size > max_size) {
993
- fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
988
+ GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
994
989
  __func__, t->name,
995
990
  ggml_backend_buft_name(buft),
996
991
  this_size, max_size);
@@ -1022,7 +1017,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
1022
1017
 
1023
1018
  if (n_buffers == 0) {
1024
1019
  #ifndef NDEBUG
1025
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
1020
+ GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
1026
1021
  #endif
1027
1022
  return NULL;
1028
1023
  }
@@ -8,6 +8,8 @@
8
8
  extern "C" {
9
9
  #endif
10
10
 
11
+ #define GGML_BACKEND_API_VERSION 1
12
+
11
13
  //
12
14
  // Backend buffer type
13
15
  //
@@ -22,7 +24,7 @@ extern "C" {
22
24
  size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
23
25
  // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
24
26
  size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
25
- // (optional) check if tensor data is in host memory (defaults to false)
27
+ // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
26
28
  bool (*is_host) (ggml_backend_buffer_type_t buft);
27
29
  };
28
30
 
@@ -37,7 +39,6 @@ extern "C" {
37
39
  //
38
40
 
39
41
  struct ggml_backend_buffer_i {
40
- const char * (*get_name) (ggml_backend_buffer_t buffer);
41
42
  // (optional) free the buffer
42
43
  void (*free_buffer) (ggml_backend_buffer_t buffer);
43
44
  // base address of the buffer
@@ -64,20 +65,20 @@ extern "C" {
64
65
  enum ggml_backend_buffer_usage usage;
65
66
  };
66
67
 
67
- ggml_backend_buffer_t ggml_backend_buffer_init(
68
+ GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
68
69
  ggml_backend_buffer_type_t buft,
69
70
  struct ggml_backend_buffer_i iface,
70
71
  void * context,
71
72
  size_t size);
72
73
 
73
74
  // do not use directly, use ggml_backend_tensor_copy instead
74
- bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
75
+ GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
75
76
 
76
77
  // multi-buffer
77
78
  // buffer that contains a collection of buffers
78
- ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
79
- bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
80
- void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
79
+ GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
80
+ GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
81
+ GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
81
82
 
82
83
  //
83
84
  // Backend (stream)
@@ -88,18 +89,16 @@ extern "C" {
88
89
 
89
90
  void (*free)(ggml_backend_t backend);
90
91
 
91
- // buffer allocation
92
- ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
93
-
94
92
  // (optional) asynchronous tensor data access
95
93
  void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
96
94
  void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
97
95
  bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
98
96
 
99
- // (optional) complete all pending operations
97
+ // (optional) complete all pending operations (required if the backend supports async operations)
100
98
  void (*synchronize)(ggml_backend_t backend);
101
99
 
102
- // (optional) compute graph with a plan (not used currently)
100
+ // (optional) graph plans (not used currently)
101
+ // compute graph with a plan
103
102
  ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
104
103
  void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
105
104
  // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@@ -110,21 +109,6 @@ extern "C" {
110
109
  // compute graph (always async if supported by the backend)
111
110
  enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
112
111
 
113
- // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
114
- // new backends should implement the device interface instead
115
-
116
- // These functions are being moved to the device interface
117
- // check if the backend can compute an operation
118
- bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
119
-
120
- // check if the backend can use tensors allocated in a buffer type
121
- bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
122
-
123
- // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
124
- // these should be expensive operations with large batch sizes that may benefit from running on this backend
125
- // even if the weight has to be copied from the CPU temporarily
126
- bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
127
-
128
112
  // (optional) event synchronization
129
113
  // record an event on this stream
130
114
  void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
@@ -184,9 +168,8 @@ extern "C" {
184
168
  // check if the backend can use tensors allocated in a buffer type
185
169
  bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
186
170
 
187
- // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
188
- // these should be expensive operations with large batch sizes that may benefit from running on this backend
189
- // even if the weight has to be copied from the CPU temporarily
171
+ // (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer
172
+ // these should be expensive operations that may benefit from running on this backend instead of the CPU backend
190
173
  bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
191
174
 
192
175
  // (optional) event synchronization
@@ -218,17 +201,55 @@ extern "C" {
218
201
  };
219
202
 
220
203
  struct ggml_backend_reg {
221
- // int api_version; // TODO: for dynamic loading
204
+ int api_version; // initialize to GGML_BACKEND_API_VERSION
222
205
  struct ggml_backend_reg_i iface;
223
206
  void * context;
224
207
  };
225
208
 
226
-
227
209
  // Internal backend registry API
228
- void ggml_backend_register(ggml_backend_reg_t reg);
229
- void ggml_backend_device_register(ggml_backend_dev_t device);
230
- // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
231
- // typedef ggml_backend_register_t * (*ggml_backend_init)(void);
210
+ GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
211
+ GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
212
+
213
+ // Add backend dynamic loading support to the backend
214
+
215
+ // Initialize the backend
216
+ typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
217
+ // Optional: obtain a score for the backend based on the system configuration
218
+ // Higher scores are preferred, 0 means the backend is not supported in the current system
219
+ typedef int (*ggml_backend_score_t)(void);
220
+
221
+ #ifdef GGML_BACKEND_DL
222
+ # ifdef __cplusplus
223
+ # define GGML_BACKEND_DL_IMPL(reg_fn) \
224
+ extern "C" { \
225
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
226
+ } \
227
+ ggml_backend_reg_t ggml_backend_init(void) { \
228
+ return reg_fn(); \
229
+ }
230
+ # define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
231
+ extern "C" { \
232
+ GGML_BACKEND_API int ggml_backend_score(void); \
233
+ } \
234
+ int ggml_backend_score(void) { \
235
+ return score_fn(); \
236
+ }
237
+ # else
238
+ # define GGML_BACKEND_DL_IMPL(reg_fn) \
239
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
240
+ ggml_backend_reg_t ggml_backend_init(void) { \
241
+ return reg_fn(); \
242
+ }
243
+ # define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
244
+ GGML_BACKEND_API int ggml_backend_score(void); \
245
+ int ggml_backend_score(void) { \
246
+ return score_fn(); \
247
+ }
248
+ # endif
249
+ #else
250
+ # define GGML_BACKEND_DL_IMPL(reg_fn)
251
+ # define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
252
+ #endif
232
253
 
233
254
  #ifdef __cplusplus
234
255
  }