@fugood/llama.node 0.2.2 → 0.3.0

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
@@ -2,10 +2,12 @@
 #undef NDEBUG
 #endif
 
-#include "llama.cpp" // TODO: not great
+#define LLAMA_API_INTERNAL
+#include "llama.h"
 #include "grammar-parser.h"
 
 #include <cassert>
+#include <stdexcept>
 
 int main()
 {
@@ -112,10 +114,14 @@ int main()
         }
     }
 
-    llama_grammar *grammar = NULL;
+    llama_grammar * grammar = NULL;
     std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-    grammar = llama_grammar_init(
-        grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+
+    grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    if (grammar == nullptr)
+    {
+        throw std::runtime_error("Failed to initialize llama_grammar");
+    }
 
     std::vector<std::vector<llama_grammar_element>> expected_stacks = {
         {
@@ -168,7 +174,7 @@ int main()
     }};
 
     auto index = 0;
-    for (auto stack : grammar->stacks)
+    for (auto stack : llama_grammar_get_stacks(grammar))
     {
         // compare stack to expected_stack
         for (uint32_t i = 0; i < stack.size(); i++)
@@ -370,13 +376,13 @@ int main()
         },
     };
 
-    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[0], next_candidates);
+    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[0], next_candidates);
 
     std::vector<std::vector<llama_grammar_candidate>> all_rejects;
 
-    for (std::size_t count = 0; count < grammar->stacks.size(); ++count)
+    for (std::size_t count = 0; count < llama_grammar_get_stacks(grammar).size(); ++count)
     {
-        rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[count], next_candidates);
+        rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[count], next_candidates);
         all_rejects.push_back(rejects);
     }
 
@@ -397,6 +403,6 @@ int main()
         delete[] candidate.code_points;
         candidate.code_points = nullptr;
     }
-    delete grammar;
+    llama_grammar_free(grammar);
     return 0;
 }
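The hunks above match tests/test-llama-grammar.cpp in the file list: the test no longer compiles llama.cpp into itself or reaches into llama_grammar internals. Rules and stacks are read through accessor functions, llama_grammar_init can now return nullptr on failure, and grammars are released with llama_grammar_free instead of delete. A minimal sketch of the updated pattern under those assumptions, also assuming the grammar_parser::parse() helper from common/grammar-parser.h (the grammar string and variable names below are illustrative, not from the package):

    // Sketch: create, inspect and free a grammar via the accessor-based internal API.
    #define LLAMA_API_INTERNAL
    #include "llama.h"
    #include "grammar-parser.h"

    #include <vector>

    int main() {
        grammar_parser::parse_state parsed = grammar_parser::parse("root ::= \"yes\" | \"no\"");
        std::vector<const llama_grammar_element *> rules(parsed.c_rules());

        // llama_grammar_init may now fail and return nullptr instead of asserting.
        llama_grammar * grammar = llama_grammar_init(rules.data(), rules.size(), parsed.symbol_ids.at("root"));
        if (grammar == nullptr) {
            return 1;
        }

        // Internals are reached through accessors rather than grammar->rules / grammar->stacks.
        const auto & grammar_rules  = llama_grammar_get_rules(grammar);
        const auto & grammar_stacks = llama_grammar_get_stacks(grammar);
        (void) grammar_rules; (void) grammar_stacks;

        llama_grammar_free(grammar); // replaces `delete grammar`
        return 0;
    }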
@@ -60,7 +60,7 @@ static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test
     qfns.from_float(test_data, tmp_q.data(), test_size);
     qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
 
-    qfns.from_float_reference(test_data, tmp_q.data(), test_size);
+    qfns.from_float_ref(test_data, tmp_q.data(), test_size);
     qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
 
     return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
@@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
         for (size_t size : params.test_sizes) {
             printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
             auto quantize_fn = [&](void) -> float {
-                qfns.from_float_reference(test_data1, test_q1, size);
+                qfns.from_float_ref(test_data1, test_q1, size);
                 return test_q1[0];
            };
            size_t quantized_size = ggml_row_size(type, size);
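The two hunks above (test-quantize-fns.cpp and test-quantize-perf.cpp, per the file list) only track a rename in ggml's type traits: the scalar reference quantizer is now from_float_ref rather than from_float_reference. A small sketch of the round trip those tests exercise, assuming the ggml_internal_get_type_traits() accessor they rely on and a type whose block size divides 256:

    // Sketch: quantize and dequantize one row with the renamed reference quantizer.
    #include "ggml.h"

    #include <cstdint>
    #include <vector>

    static void roundtrip_q8_0_reference() {
        ggml_type_traits_t qfns = ggml_internal_get_type_traits(GGML_TYPE_Q8_0);

        std::vector<float>   src(256, 0.5f), dst(256);
        std::vector<uint8_t> q(ggml_row_size(GGML_TYPE_Q8_0, src.size()));

        qfns.from_float_ref(src.data(), q.data(), src.size()); // was qfns.from_float_reference(...)
        qfns.to_float(q.data(), dst.data(), dst.size());
    }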
@@ -162,12 +162,12 @@ int main(int /*argc*/, const char ** /*argv*/) {
         x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 
         // 100, 101, 102, ..., 172
-        struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode, 1024);
+        struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
         // -67, -67, -67, ..., -67
-        struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode, 1024); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+        struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
 
         // 33, 34, 35, ..., 105
-        struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode, 1024);
+        struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
 
         ggml_cgraph * gf = ggml_new_graph(ctx0);
 
@@ -218,4 +218,3 @@ int main(int /*argc*/, const char ** /*argv*/) {
 
     return 0;
 }
-
@@ -166,12 +166,12 @@ static void test_sampler_queue(
     for (auto s : samplers_sequence) {
        switch (s){
            case 'k': llama_sample_top_k    (nullptr, &candidates_p, top_k, 1); break;
-           case 'f': GGML_ASSERT(false && "tail_free test not implemented"); break;
-           case 'y': GGML_ASSERT(false && "typical test not implemented"); break;
+           case 'f': GGML_ABORT("tail_free test not implemented"); break;
+           case 'y': GGML_ABORT("typical test not implemented"); break;
            case 'p': llama_sample_top_p    (nullptr, &candidates_p, top_p, 1); break;
            case 'm': llama_sample_min_p    (nullptr, &candidates_p, min_p, 1); break;
-           case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
-           default : GGML_ASSERT(false && "Unknown sampler"); break;
+           case 't': GGML_ABORT("temperature test not implemented"); break;
+           default : GGML_ABORT("Unknown sampler"); break;
        }
 
        llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
@@ -222,7 +222,7 @@ static void test_sampler_queue(
        GGML_ASSERT(candidates_p.data[0].id == max_token_id);
        GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
    } else {
-       GGML_ABORT: GGML_ASSERT(false);
+       GGML_ABORT("fatal error");
    }
 }
 
@@ -195,11 +195,11 @@ int main(int argc, char **argv) {
     const bool add_special = false;
 
     for (const auto & test_kv : k_tests) {
-        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special);
+        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
 
         printf("\n");
         printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
+        printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
         printf("tok: ");
         for (const auto & tok : res) {
             printf("%d ", tok);
@@ -216,8 +216,8 @@ int main(int argc, char **argv) {
         if (!correct) {
             fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
             fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize_bpe(ctx, res).c_str(),
-                llama_detokenize_bpe(ctx, test_kv.second).c_str());
+                llama_detokenize(ctx, res).c_str(),
+                llama_detokenize(ctx, test_kv.second).c_str());
             fprintf(stderr, "%s : expected tokens: ", __func__);
             for (const auto & t : test_kv.second) {
                 fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
@@ -253,7 +253,7 @@ int main(int argc, char **argv) {
        {
            const auto t_start = ggml_time_us();
 
-           res = llama_tokenize(ctx, text, add_special);
+           res = llama_tokenize(ctx, text, add_special, false);
 
            const auto t_end = ggml_time_us();
 
@@ -272,7 +272,7 @@ int main(int argc, char **argv) {
        }
 
        for (const auto & tok : res) {
-           //ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
+           //ofs << tok << " '" << string_strip(llama_detokenize(ctx, std::vector<int>{tok})) << "'" << std::endl;
            ofs << tok << "\n";
        }
    }
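The tokenizer-test hunks above and below follow two changes in the common helpers: the llama_tokenize() wrapper now takes an explicit parse_special flag after add_special, and the vocab-specific llama_detokenize_bpe()/llama_detokenize_spm() helpers are folded into a single llama_detokenize(). A hedged sketch of the new calls, assuming those common.h signatures and an already-created llama_context:

    // Sketch: round-trip text through the updated common tokenize/detokenize helpers.
    #include "common.h"

    static std::string roundtrip(llama_context * ctx, const std::string & text) {
        // add_special = false, parse_special = false (the new explicit flag).
        std::vector<llama_token> tokens = llama_tokenize(ctx, text, false, false);

        // One detokenizer for every vocab type; replaces llama_detokenize_bpe()/llama_detokenize_spm().
        return llama_detokenize(ctx, tokens);
    }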
@@ -11,6 +11,7 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <atomic>
 
 int main(int argc, char **argv) {
     if (argc < 2 || argc > 3) {
@@ -63,7 +64,10 @@ int main(int argc, char **argv) {
         }
     }
 
-    GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_BPE);
+    //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_BPE);
+    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
+        return 99;
+    }
 
 #ifdef _WIN32
     // We need this for unicode console support
@@ -74,7 +78,7 @@ int main(int argc, char **argv) {
     const int n_vocab = llama_n_vocab(model);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
+        std::string str = llama_detokenize(ctx, std::vector<int>(1, i));
         try {
             auto cps = unicode_cpts_from_utf8(str);
             std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
@@ -90,7 +94,7 @@ int main(int argc, char **argv) {
             fprintf(stderr, "]\n");
             return 2;
         }
-        std::string check = llama_detokenize_bpe(ctx, tokens);
+        std::string check = llama_detokenize(ctx, tokens);
         if (check != str) {
             fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                 __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -108,26 +112,23 @@ int main(int argc, char **argv) {
 
         std::vector<std::thread> threads(nthread);
 
+        std::atomic_int errcode = {};
+
         for (int i = 0; i < nthread; ++i) {
-            threads[i] = std::thread([i, nthread, ctx]() {
-                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
-                    if (!( // NOLINT
-                        (cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 &&
-                        (cp < 0x13 || cp > 0x17) && cp != 0x19 &&
-                        (cp < 0x1c || cp > 0x1e) &&
-                        (cp < 0xd800 || cp > 0xdfff) &&
-                        (cp < 0x00040000 || cp >= 0x000e0000)
-                    )) {
+            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
+                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
+                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs}
+                        (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn}
                         continue;
                     }
 
                     std::string str = unicode_cpt_to_utf8(cp);
                     std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-                    std::string check = llama_detokenize_bpe(ctx, tokens);
+                    std::string check = llama_detokenize(ctx, tokens);
                     if (cp != 9601 && str != check) {
-                        fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                             cp, check.c_str(), check.length(), str.c_str(), str.length());
-                        std::exit(3);
+                        errcode = 3;
                     }
                 }
             });
@@ -136,6 +137,10 @@ int main(int argc, char **argv) {
         for (auto & t : threads) {
             t.join();
         }
+
+        if (errcode) {
+            return errcode;
+        }
     }
 
     llama_free_model(model);
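Both threaded tokenizer tests (the BPE hunk above and the SPM hunk below) also change how failures escape the worker threads: instead of calling std::exit(3) inside a lambda, each thread sets a shared std::atomic_int and stops early, and the test returns the code after joining. A small self-contained sketch of that pattern (the per-codepoint check itself is elided):

    // Sketch: propagate a failure out of worker threads without std::exit().
    #include <atomic>
    #include <cstdint>
    #include <thread>
    #include <vector>

    static int run_checks(int nthread) {
        std::atomic_int errcode{0};

        std::vector<std::thread> threads(nthread);
        for (int i = 0; i < nthread; ++i) {
            threads[i] = std::thread([i, nthread, &errcode]() {
                // Stop looping as soon as any thread has reported an error.
                for (uint32_t cp = (uint32_t) i; !errcode && cp < 0x00110000; cp += (uint32_t) nthread) {
                    bool ok = true; // placeholder for the real tokenize/detokenize round-trip check
                    if (!ok) {
                        errcode = 3; // remembered instead of calling std::exit(3)
                    }
                }
            });
        }
        for (auto & t : threads) {
            t.join();
        }
        return errcode; // 0 on success, non-zero if any thread flagged a failure
    }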
@@ -11,6 +11,7 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <atomic>
 
 int main(int argc, char ** argv) {
     if (argc < 2) {
@@ -51,7 +52,10 @@ int main(int argc, char ** argv) {
         }
     }
 
-    GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+    //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_SPM) {
+        return 99;
+    }
 
 #ifdef _WIN32
     // We need this for unicode console support
@@ -62,9 +66,9 @@ int main(int argc, char ** argv) {
     const int n_vocab = llama_n_vocab(model);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize_spm(ctx, std::vector<int>(1, i));
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize_spm(ctx, tokens);
+        std::string str = llama_detokenize(ctx, std::vector<int>(1, i), true);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+        std::string check = llama_detokenize(ctx, tokens);
         if (check != str) {
             fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                 __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -78,20 +82,23 @@ int main(int argc, char ** argv) {
 
         std::vector<std::thread> threads(nthread);
 
+        std::atomic_int errcode = {};
+
         for (int i = 0; i < nthread; ++i) {
-            threads[i] = std::thread([i, nthread, ctx]() {
-                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
-                    if (cp >= 0xd800 && cp <= 0xdfff) {
+            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
+                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
+                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs}
+                        (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn}
                         continue;
                     }
 
                     std::string str = unicode_cpt_to_utf8(cp);
-                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-                    std::string check = llama_detokenize_spm(ctx, tokens);
+                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+                    std::string check = llama_detokenize(ctx, tokens);
                     if (cp != 9601 && str != check) {
-                        fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                             cp, check.c_str(), check.length(), str.c_str(), str.length());
-                        std::exit(3);
+                        errcode = 3;
                     }
                 }
            });
@@ -100,6 +107,10 @@ int main(int argc, char ** argv) {
        for (auto & t : threads) {
            t.join();
        }
+
+       if(errcode) {
+           return errcode;
+       }
    }
 
    llama_free_model(model);
Binary files changed (contents not shown)
@@ -1,5 +0,0 @@
-set(TARGET beam-search)
-add_executable(${TARGET} beam-search.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -1,188 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#   define NOMINMAX
-#endif
-#include <windows.h>
-#include <signal.h>
-#endif
-
-// Used for debugging to print out beam tokens.
-struct ostream_beam_view {
-    llama_context * ctx;
-    llama_beam_view beam_view;
-};
-
-static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
-    os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
-    for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
-        os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
-    }
-    return os << ')';
-}
-
-// Put here anything you want back in beam_search_callback().
-struct beam_search_callback_data {
-    llama_context * ctx;
-    std::vector<llama_token> response;
-};
-
-// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
-// For example, eob can be flagged due to maximum token length, stop words, etc.
-static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
-    return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
-}
-
-// Function matching type llama_beam_search_callback_fn_t.
-// Custom callback example is called each time the beams lengths increase:
-//  * Show progress by printing ',' following by number of convergent beam tokens if any.
-//  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
-//    This is also called when the stop condition is met.
-//    Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
-    auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
-    // Mark beams as EOS as needed.
-    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
-        llama_beam_view& beam_view = beams_state.beam_views[i];
-        if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
-            beam_view.eob = true;
-        }
-    }
-    printf(","); // Show progress
-    if (const size_t n = beams_state.common_prefix_length) {
-        callback_data.response.resize(callback_data.response.size() + n);
-        assert(0u < beams_state.n_beams);
-        const llama_token * tokens = beams_state.beam_views[0].tokens;
-        std::copy(tokens, tokens + n, callback_data.response.end() - n);
-        printf("%zu", n);
-    }
-    fflush(stdout);
-#if 1 // DEBUG: print current beams for this iteration
-    std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
-    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
-        std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
-    }
-#endif
-}
-
-int main(int argc, char ** argv)
-{
-    gpt_params params;
-    //params.n_gpu_layers = 200;
-
-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc < 2 || argv[1][0] == '-' )
-    {
-        printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
-        return 1 ;
-    }
-
-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    params.model = argv[1];
-
-    params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
-
-    if ( argc > 3 )
-    {
-        params.prompt = argv[3];
-    }
-
-    if ( params.prompt.empty() )
-    {
-        params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
-    }
-
-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    llama_model * model;
-    llama_context * ctx;
-
-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
-
-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
-        return 1;
-    }
-
-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
-
-    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
-
-    const size_t max_context_size = llama_n_ctx( ctx );
-    const size_t max_tokens_list_size = max_context_size - 4 ;
-
-    if (tokens_list.size() > max_tokens_list_size)
-    {
-        fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
-             __func__ , tokens_list.size() , max_tokens_list_size );
-        return 1;
-    }
-
-    fprintf( stderr, "\n\n" );
-
-    // Print the tokens from the prompt :
-
-    for( auto id : tokens_list )
-    {
-        std::cout << llama_token_to_piece(ctx, id);
-    }
-    std::cout << std::flush;
-
-    int n_past = 0;
-
-    if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
-    {
-        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
-        return 1;
-    }
-    n_past += tokens_list.size();
-
-    beam_search_callback_data callback_data{ctx, {}};
-    size_t const beam_width = static_cast<size_t>(params.n_beams);
-    int const n_predict = 256;
-    llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
-
-    std::cout << "\n\n";
-    for (llama_token const token_id : callback_data.response) {
-        std::cout << llama_token_to_piece(ctx,token_id);
-    }
-    std::cout << std::endl;
-
-    llama_free( ctx );
-    llama_free_model( model );
-
-    llama_backend_free();
-
-    return 0;
-}