@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/examples/infill/infill.cpp

@@ -107,6 +107,7 @@ int main(int argc, char ** argv) {
     g_params = &params;
 
     if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
         return 1;
     }
 
@@ -139,27 +140,6 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
     }
-    if (params.instruct) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for instruct mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-    if (params.chatml) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for chatml mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-    if (!params.antiprompt.empty()) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
     if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
         printf("\n************\n");
         printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
@@ -167,20 +147,6 @@ int main(int argc, char ** argv) {
 
         return 0;
     }
-    if (params.random_prompt) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-    if (!params.path_prompt_cache.empty()) {
-        printf("\n************\n");
-        printf("%s: infill does not support prompt caching\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
 
     if (params.rope_freq_base != 0.0) {
         LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
@@ -207,17 +173,13 @@ int main(int argc, char ** argv) {
 
     llama_model * model;
     llama_context * ctx;
-    llama_context * ctx_guidance = NULL;
+
     g_model = &model;
     g_ctx = &ctx;
 
     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (sparams.cfg_scale > 1.f) {
-        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
-        ctx_guidance = llama_new_context_with_model(model, lparams);
-    }
 
     if (model == NULL) {
         LOG_TEE("%s: error: unable to load model\n", __func__);
@@ -242,26 +204,28 @@ int main(int argc, char ** argv) {
     GGML_ASSERT(llama_add_eos_token(model) != 1);
     LOG("add_bos: %d\n", add_bos);
 
-    bool suff_rm_leading_spc = params.escape;
-    if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-        params.input_suffix.erase(0, 1);
-        suff_rm_leading_spc = false;
-    }
     std::vector<llama_token> embd_inp;
+    std::vector<llama_token> embd_end;
     std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
     std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
-    const int space_token = 29871;
-    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
-        inp_sfx.erase(inp_sfx.begin());
-    }
+
+    GGML_ASSERT(llama_token_prefix(model) >= 0);
+    GGML_ASSERT(llama_token_suffix(model) >= 0);
+
     inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
+    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+
+    embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+    embd_end = params.spm_infill ? inp_pfx : inp_sfx;
     if (add_bos) {
-        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
+        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+    }
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
+    const llama_token middle_token = llama_token_middle(model);
+    if (middle_token >= 0) {
+        embd_inp.push_back(middle_token);
     }
-    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-    embd_inp = inp_pfx;
-    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-    embd_inp.push_back(llama_token_middle(model));
 
     LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
     LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
@@ -273,25 +237,6 @@ int main(int argc, char ** argv) {
         LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
     }
 
-    // Tokenize negative prompt
-    std::vector<llama_token> guidance_inp;
-    int guidance_offset = 0;
-    int original_prompt_len = 0;
-    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
-        original_prompt_len = original_inp.size();
-        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
-        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
-        LOG("guidance_offset: %s", log_tostr(guidance_offset));
-    }
-
     if ((int) embd_inp.size() > n_ctx - 4) {
         LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
@@ -319,15 +264,6 @@ int main(int argc, char ** argv) {
             LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
-        if (ctx_guidance) {
-            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
-            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
-            }
-        }
-
         if (params.n_keep > 0) {
             LOG_TEE("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
@@ -395,12 +331,11 @@ int main(int argc, char ** argv) {
         is_interacting = params.interactive_first;
     }
 
-    bool input_echo           = true;
+    bool input_echo = true;
 
-    int n_past             = 0;
-    int n_remain           = params.n_predict;
-    int n_consumed         = 0;
-    int n_past_guidance    = 0;
+    int n_past     = 0;
+    int n_remain   = params.n_predict;
+    int n_consumed = 0;
 
     std::vector<int> input_tokens; g_input_tokens = &input_tokens;
     std::vector<int> output_tokens; g_output_tokens = &output_tokens;
@@ -410,7 +345,6 @@ int main(int argc, char ** argv) {
         console::set_display(console::prompt);
 
     std::vector<llama_token> embd;
-    std::vector<llama_token> embd_guidance;
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
 
@@ -436,7 +370,7 @@ int main(int argc, char ** argv) {
             // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
+            if (n_past + (int) embd.size() > n_ctx) {
                 if (params.n_predict == -2) {
                     LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                     break;
@@ -453,11 +387,7 @@ int main(int argc, char ** argv) {
 
                 n_past -= n_discard;
 
-                if (ctx_guidance) {
-                    n_past_guidance -= n_discard;
-                }
-
-                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
+                LOG("after swap: n_past = %d\n", n_past);
 
                 LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
 
@@ -465,45 +395,6 @@ int main(int argc, char ** argv) {
 
             // evaluate tokens in batches
             // embd is typically prepared beforehand to fit within a batch, but not always
-
-            if (ctx_guidance) {
-                int input_size = 0;
-                llama_token * input_buf = NULL;
-
-                if (n_past_guidance < (int) guidance_inp.size()) {
-                    // Guidance context should have the same data with these modifications:
-                    //
-                    // * Replace the initial prompt
-                    // * Shift everything by guidance_offset
-                    embd_guidance = guidance_inp;
-                    if (embd.begin() + original_prompt_len < embd.end()) {
-                        embd_guidance.insert(
-                            embd_guidance.end(),
-                            embd.begin() + original_prompt_len,
-                            embd.end()
-                        );
-                    }
-
-                    input_buf = embd_guidance.data();
-                    input_size = embd_guidance.size();
-
-                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
-                } else {
-                    input_buf = embd.data();
-                    input_size = embd.size();
-                }
-
-                for (int i = 0; i < input_size; i += params.n_batch) {
-                    int n_eval = std::min(input_size - i, params.n_batch);
-                    if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
-                        LOG_TEE("%s : failed to eval\n", __func__);
-                        return 1;
-                    }
-
-                    n_past_guidance += n_eval;
-                }
-            }
-
             for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                 int n_eval = (int) embd.size() - i;
                 if (n_eval > params.n_batch) {
@@ -525,11 +416,9 @@ int main(int argc, char ** argv) {
         }
 
         embd.clear();
-        embd_guidance.clear();
 
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-
-            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
+            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
 
             llama_sampling_accept(ctx_sampling, ctx, id, true);
 
@@ -583,7 +472,6 @@ int main(int argc, char ** argv) {
 
         // if not currently processing queued inputs;
         if ((int) embd_inp.size() <= n_consumed) {
-
            // deal with eot token in infill mode
            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
                if (is_interacting && !params.interactive_first) {
@@ -624,27 +512,26 @@ int main(int argc, char ** argv) {
                    string_process_escapes(params.input_prefix);
                    string_process_escapes(params.input_suffix);
                }
-                suff_rm_leading_spc = params.escape;
-                if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-                    params.input_suffix.erase(0, 1);
-                    suff_rm_leading_spc = false;
-                }
+
                // tokenize new prefix and suffix
                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
-                if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
-                    inp_sfx.erase(inp_sfx.begin());
-                }
+
                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
+                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+
+                embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+                embd_end = params.spm_infill ? inp_pfx : inp_sfx;
                if (add_bos) {
-                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
+                    embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
                }
-                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-                embd_inp = inp_pfx;
-                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-                embd_inp.push_back(llama_token_middle(model));
+                embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
+                if (middle_token >= 0) {
+                    embd_inp.push_back(middle_token);
+                }
+
                embd.clear();
-                embd_guidance.clear();
                n_remain = params.n_predict;
                n_past = 0;
                n_consumed = 0;
@@ -751,7 +638,6 @@ int main(int argc, char ** argv) {
    llama_print_timings(ctx);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
-    if (ctx_guidance) { llama_free(ctx_guidance); }
    llama_free(ctx);
    llama_free_model(model);
 
@@ -764,4 +650,3 @@ int main(int argc, char ** argv) {
 
    return 0;
 }
-
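
The infill.cpp changes above drop the classifier-free-guidance path (ctx_guidance, n_past_guidance) and the hard-coded SentencePiece space-token workaround, and instead build the fill-in-the-middle prompt from the model's FIM special tokens, with the new spm_infill flag swapping the prefix/suffix order and the middle token appended only when the model defines one. A minimal standalone sketch of the resulting token ordering, using plain ints and made-up token IDs purely for illustration (the real IDs come from llama_token_prefix/suffix/middle in the diff):

#include <cstdio>
#include <vector>

// Illustrative stand-ins for the model's FIM special tokens; real values come
// from llama_token_prefix/suffix/middle(model) as shown in the diff above.
enum : int { TOK_BOS = 1, TOK_FIM_PRE = 32007, TOK_FIM_SUF = 32008, TOK_FIM_MID = 32009 };

// Mirrors the new prompt assembly in infill.cpp: prefix block first by default,
// suffix block first when spm_infill is set; the middle token is appended only
// if the model defines one (id >= 0).
static std::vector<int> build_infill_prompt(std::vector<int> pfx, std::vector<int> sfx,
                                            bool spm_infill, bool add_bos,
                                            int middle_token = TOK_FIM_MID) {
    pfx.insert(pfx.begin(), TOK_FIM_PRE);
    sfx.insert(sfx.begin(), TOK_FIM_SUF);

    std::vector<int> inp = spm_infill ? sfx : pfx;
    std::vector<int> end = spm_infill ? pfx : sfx;

    if (add_bos) {
        inp.insert(inp.begin(), TOK_BOS);
    }
    inp.insert(inp.end(), end.begin(), end.end());
    if (middle_token >= 0) {
        inp.push_back(middle_token);
    }
    return inp;
}

int main() {
    // e.g. prefix tokens {10, 11}, suffix tokens {20, 21}
    for (int t : build_infill_prompt({10, 11}, {20, 21}, /*spm_infill=*/false, /*add_bos=*/true)) {
        printf("%d ", t); // prints: 1 32007 10 11 32008 20 21 32009
    }
    printf("\n");
    return 0;
}

With spm_infill set, the same call emits the suffix block before the prefix block, which is the order that SPM-style infill models expect.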