@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
@@ -254,18 +254,8 @@
254
254
 
255
255
  #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
256
256
 
257
- #define GGML_ASSERT(x) \
258
- do { \
259
- if (!(x)) { \
260
- fflush(stdout); \
261
- fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
262
- ggml_print_backtrace(); \
263
- abort(); \
264
- } \
265
- } while (0)
266
-
267
257
  #ifndef NDEBUG
268
- #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
258
+ #define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
269
259
  #elif defined(__GNUC__)
270
260
  #define GGML_UNREACHABLE() __builtin_unreachable()
271
261
  #elif defined(_MSC_VER)
@@ -274,6 +264,17 @@
274
264
  #define GGML_UNREACHABLE() ((void) 0)
275
265
  #endif
276
266
 
267
+ #ifdef __cplusplus
268
+ #define GGML_NORETURN [[noreturn]]
269
+ #elif defined(_MSC_VER)
270
+ #define GGML_NORETURN __declspec(noreturn)
271
+ #else
272
+ #define GGML_NORETURN _Noreturn
273
+ #endif
274
+
275
+ #define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
276
+ #define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
277
+
277
278
  // used to copy the number of elements and stride in bytes of tensors into local variables.
278
279
  // main purpose is to reduce code duplication and improve readability.
279
280
  //
@@ -312,10 +313,19 @@
312
313
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
313
314
  GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
314
315
 
316
+ #define GGML_TENSOR_BINARY_OP_LOCALS01 \
317
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
318
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
319
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
320
+ GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
321
+
315
322
  #ifdef __cplusplus
316
323
  extern "C" {
317
324
  #endif
318
325
 
326
+ GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
327
+ GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
328
+
319
329
  enum ggml_status {
320
330
  GGML_STATUS_ALLOC_FAILED = -2,
321
331
  GGML_STATUS_FAILED = -1,
@@ -377,6 +387,9 @@ extern "C" {
377
387
  GGML_TYPE_F64 = 28,
378
388
  GGML_TYPE_IQ1_M = 29,
379
389
  GGML_TYPE_BF16 = 30,
390
+ GGML_TYPE_Q4_0_4_4 = 31,
391
+ GGML_TYPE_Q4_0_4_8 = 32,
392
+ GGML_TYPE_Q4_0_8_8 = 33,
380
393
  GGML_TYPE_COUNT,
381
394
  };
382
395
 
@@ -418,6 +431,9 @@ extern "C" {
418
431
  GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
419
432
  GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
420
433
  GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
434
+ GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
435
+ GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
436
+ GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
421
437
  };
422
438
 
423
439
  // available tensor operations:
@@ -585,11 +601,7 @@ extern "C" {
585
601
  struct ggml_tensor * grad;
586
602
  struct ggml_tensor * src[GGML_MAX_SRC];
587
603
 
588
- // performance
589
- int perf_runs;
590
- int64_t perf_cycles;
591
- int64_t perf_time_us;
592
-
604
+ // source tensor and offset for views
593
605
  struct ggml_tensor * view_src;
594
606
  size_t view_offs;
595
607
 
@@ -599,7 +611,7 @@ extern "C" {
599
611
 
600
612
  void * extra; // extra things e.g. for ggml-cuda.cu
601
613
 
602
- char padding[8];
614
+ // char padding[4];
603
615
  };
604
616
 
605
617
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -628,8 +640,11 @@ extern "C" {
628
640
  GGML_CGRAPH_EVAL_ORDER_COUNT
629
641
  };
630
642
 
643
+ typedef uint32_t ggml_bitset_t;
644
+
631
645
  struct ggml_hash_set {
632
646
  size_t size;
647
+ ggml_bitset_t * used;
633
648
  struct ggml_tensor ** keys;
634
649
  };
635
650
 
@@ -643,14 +658,9 @@ extern "C" {
643
658
  struct ggml_tensor ** grads;
644
659
  struct ggml_tensor ** leafs;
645
660
 
646
- struct ggml_hash_set visited_hash_table;
661
+ struct ggml_hash_set visited_hash_set;
647
662
 
648
663
  enum ggml_cgraph_eval_order order;
649
-
650
- // performance
651
- int perf_runs;
652
- int64_t perf_cycles;
653
- int64_t perf_time_us;
654
664
  };
655
665
 
656
666
  // scratch buffer
@@ -667,28 +677,6 @@ extern "C" {
667
677
  bool no_alloc; // don't allocate memory for the tensor data
668
678
  };
669
679
 
670
-
671
- // compute types
672
-
673
- // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
674
- // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
675
- enum ggml_task_type {
676
- GGML_TASK_TYPE_INIT = 0,
677
- GGML_TASK_TYPE_COMPUTE,
678
- GGML_TASK_TYPE_FINALIZE,
679
- };
680
-
681
- struct ggml_compute_params {
682
- enum ggml_task_type type;
683
-
684
- // ith = thread index, nth = number of threads
685
- int ith, nth;
686
-
687
- // work buffer for all threads
688
- size_t wsize;
689
- void * wdata;
690
- };
691
-
692
680
  // numa strategies
693
681
  enum ggml_numa_strategy {
694
682
  GGML_NUMA_STRATEGY_DISABLED = 0,
@@ -717,8 +705,6 @@ extern "C" {
717
705
  GGML_API int64_t ggml_cycles(void);
718
706
  GGML_API int64_t ggml_cycles_per_ms(void);
719
707
 
720
- GGML_API void ggml_print_backtrace(void);
721
-
722
708
  // accepts a UTF-8 path, even on Windows
723
709
  GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
724
710
 
@@ -733,9 +719,9 @@ extern "C" {
733
719
  GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
734
720
  GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
735
721
 
736
- GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
737
- GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
738
- GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
722
+ GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type);
723
+ GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
724
+ GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
739
725
 
740
726
  GGML_DEPRECATED(
741
727
  GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
@@ -756,7 +742,6 @@ extern "C" {
756
742
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
757
743
 
758
744
  GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
759
- GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
760
745
  GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
761
746
  GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
762
747
  GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
@@ -765,9 +750,16 @@ extern "C" {
765
750
  GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
766
751
  GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
767
752
 
753
+ GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor);
754
+ GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
755
+ GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
756
+ GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
757
+
768
758
  GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
769
759
  GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
770
760
 
761
+ GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
762
+
771
763
  // use this to compute the memory overhead of a tensor
772
764
  GGML_API size_t ggml_tensor_overhead(void);
773
765
 
@@ -1461,7 +1453,6 @@ extern "C" {
1461
1453
  // rotary position embedding
1462
1454
  // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
1463
1455
  // if mode & 2 == 1, GPT-NeoX style
1464
- // if mode & 4 == 1, ChatGLM style
1465
1456
  //
1466
1457
  // b is an int32 vector with size a->ne[2], it contains the positions
1467
1458
  // c is freq factors (e.g. phi3-128k), (optional)
@@ -1470,8 +1461,7 @@ extern "C" {
1470
1461
  struct ggml_tensor * a,
1471
1462
  struct ggml_tensor * b,
1472
1463
  int n_dims,
1473
- int mode,
1474
- int n_ctx);
1464
+ int mode);
1475
1465
 
1476
1466
  // in-place, returns view(a)
1477
1467
  GGML_API struct ggml_tensor * ggml_rope_inplace(
@@ -1479,8 +1469,7 @@ extern "C" {
1479
1469
  struct ggml_tensor * a,
1480
1470
  struct ggml_tensor * b,
1481
1471
  int n_dims,
1482
- int mode,
1483
- int n_ctx);
1472
+ int mode);
1484
1473
 
1485
1474
  // custom RoPE
1486
1475
  GGML_API struct ggml_tensor * ggml_rope_ext(
@@ -1490,8 +1479,7 @@ extern "C" {
1490
1479
  struct ggml_tensor * c,
1491
1480
  int n_dims,
1492
1481
  int mode,
1493
- int n_ctx,
1494
- int n_orig_ctx,
1482
+ int n_ctx_orig,
1495
1483
  float freq_base,
1496
1484
  float freq_scale,
1497
1485
  float ext_factor,
@@ -1507,8 +1495,7 @@ extern "C" {
1507
1495
  struct ggml_tensor * c,
1508
1496
  int n_dims,
1509
1497
  int mode,
1510
- int n_ctx,
1511
- int n_orig_ctx,
1498
+ int n_ctx_orig,
1512
1499
  float freq_base,
1513
1500
  float freq_scale,
1514
1501
  float ext_factor,
@@ -1522,8 +1509,7 @@ extern "C" {
1522
1509
  struct ggml_tensor * b,
1523
1510
  int n_dims,
1524
1511
  int mode,
1525
- int n_ctx,
1526
- int n_orig_ctx,
1512
+ int n_ctx_orig,
1527
1513
  float freq_base,
1528
1514
  float freq_scale,
1529
1515
  float ext_factor,
@@ -1538,8 +1524,7 @@ extern "C" {
1538
1524
  struct ggml_tensor * b,
1539
1525
  int n_dims,
1540
1526
  int mode,
1541
- int n_ctx,
1542
- int n_orig_ctx,
1527
+ int n_ctx_orig,
1543
1528
  float freq_base,
1544
1529
  float freq_scale,
1545
1530
  float ext_factor,
@@ -1550,7 +1535,7 @@ extern "C" {
1550
1535
 
1551
1536
  // compute correction dims for YaRN RoPE scaling
1552
1537
  GGML_CALL void ggml_rope_yarn_corr_dims(
1553
- int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1538
+ int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1554
1539
 
1555
1540
  // rotary position embedding backward, i.e compute dx from dy
1556
1541
  // a - dy
@@ -1561,16 +1546,13 @@ extern "C" {
1561
1546
  struct ggml_tensor * c,
1562
1547
  int n_dims,
1563
1548
  int mode,
1564
- int n_ctx,
1565
- int n_orig_ctx,
1549
+ int n_ctx_orig,
1566
1550
  float freq_base,
1567
1551
  float freq_scale,
1568
1552
  float ext_factor,
1569
1553
  float attn_factor,
1570
1554
  float beta_fast,
1571
- float beta_slow,
1572
- float xpos_base,
1573
- bool xpos_down);
1555
+ float beta_slow);
1574
1556
 
1575
1557
  // clamp
1576
1558
  // in-place, returns view(a)
@@ -2028,8 +2010,8 @@ extern "C" {
2028
2010
 
2029
2011
  // ggml_graph_plan() has to be called before ggml_graph_compute()
2030
2012
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
2031
- GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
2032
- GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
2013
+ GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
2014
+ GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
2033
2015
  // same as ggml_graph_compute() but the work data is allocated as a part of the context
2034
2016
  // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
2035
2017
  GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
@@ -2413,15 +2395,17 @@ extern "C" {
2413
2395
  GGML_API int ggml_cpu_has_wasm_simd (void);
2414
2396
  GGML_API int ggml_cpu_has_blas (void);
2415
2397
  GGML_API int ggml_cpu_has_cuda (void);
2416
- GGML_API int ggml_cpu_has_clblast (void);
2417
2398
  GGML_API int ggml_cpu_has_vulkan (void);
2418
2399
  GGML_API int ggml_cpu_has_kompute (void);
2419
2400
  GGML_API int ggml_cpu_has_gpublas (void);
2420
2401
  GGML_API int ggml_cpu_has_sse3 (void);
2421
2402
  GGML_API int ggml_cpu_has_ssse3 (void);
2422
2403
  GGML_API int ggml_cpu_has_sycl (void);
2404
+ GGML_API int ggml_cpu_has_rpc (void);
2423
2405
  GGML_API int ggml_cpu_has_vsx (void);
2424
2406
  GGML_API int ggml_cpu_has_matmul_int8(void);
2407
+ GGML_API int ggml_cpu_has_cann (void);
2408
+ GGML_API int ggml_cpu_has_llamafile (void);
2425
2409
 
2426
2410
  //
2427
2411
  // Internal types and functions exposed for tests and benchmarks
@@ -2435,20 +2419,31 @@ extern "C" {
2435
2419
  #endif
2436
2420
  typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
2437
2421
  typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
2438
- typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
2439
- const void * GGML_RESTRICT y, size_t by, int nrc);
2422
+ typedef void (*ggml_from_float_to_mat_t)
2423
+ (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
2424
+ typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
2425
+ const void * GGML_RESTRICT y, size_t by, int nrc);
2426
+ typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
2427
+ const void * GGML_RESTRICT y, int nr, int nc);
2428
+ typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
2429
+ const void * GGML_RESTRICT y, int nr, int nc);
2440
2430
 
2441
2431
  typedef struct {
2442
- const char * type_name;
2443
- int blck_size;
2444
- size_t type_size;
2445
- bool is_quantized;
2446
- ggml_to_float_t to_float;
2447
- ggml_from_float_t from_float;
2448
- ggml_from_float_t from_float_reference;
2449
- ggml_vec_dot_t vec_dot;
2450
- enum ggml_type vec_dot_type;
2451
- int64_t nrows; // number of rows to process simultaneously;
2432
+ const char * type_name;
2433
+ int64_t blck_size;
2434
+ int64_t blck_size_interleave; // interleave elements in blocks
2435
+ size_t type_size;
2436
+ bool is_quantized;
2437
+ ggml_to_float_t to_float;
2438
+ ggml_from_float_t from_float;
2439
+ ggml_from_float_t from_float_ref;
2440
+ ggml_from_float_to_mat_t from_float_to_mat;
2441
+ ggml_vec_dot_t vec_dot;
2442
+ enum ggml_type vec_dot_type;
2443
+ int64_t nrows; // number of rows to process simultaneously
2444
+ int64_t ncols; // number of columns to process simultaneously
2445
+ ggml_gemv_t gemv;
2446
+ ggml_gemm_t gemm;
2452
2447
  } ggml_type_traits_t;
2453
2448
 
2454
2449
  GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);