@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/examples/main/main.cpp

@@ -37,14 +37,15 @@ static gpt_params * g_params;
  static std::vector<llama_token> * g_input_tokens;
  static std::ostringstream * g_output_ss;
  static std::vector<llama_token> * g_output_tokens;
- static bool is_interacting = false;
+ static bool is_interacting = false;
+ static bool need_insert_eot = false;

- static bool file_exists(const std::string &path) {
+ static bool file_exists(const std::string & path) {
  std::ifstream f(path.c_str());
  return f.good();
  }

- static bool file_is_empty(const std::string &path) {
+ static bool file_is_empty(const std::string & path) {
  std::ifstream f;
  f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
  f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
@@ -99,7 +100,8 @@ static void write_logfile(
  static void sigint_handler(int signo) {
  if (signo == SIGINT) {
  if (!is_interacting && g_params->interactive) {
- is_interacting = true;
+ is_interacting = true;
+ need_insert_eot = true;
  } else {
  console::cleanup();
  printf("\n");
@@ -117,13 +119,24 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
  LOG_TEE("%s", text);
  }

+ static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
+ llama_chat_msg new_msg{role, content};
+ auto formatted = llama_chat_format_single(
+ model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+ chat_msgs.push_back({role, content});
+ LOG("formatted: %s\n", formatted.c_str());
+ return formatted;
+ }
+
  int main(int argc, char ** argv) {
  gpt_params params;
  g_params = &params;

  if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
  return 1;
  }
+
  llama_sampling_params & sparams = params.sparams;

  #ifndef LOG_DISABLE_LOGS
@@ -180,9 +193,6 @@ int main(int argc, char ** argv) {
  LOG_TEE("%s: seed = %u\n", __func__, params.seed);

  std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = string_random_prompt(rng);
- }

  LOG("%s: llama backend init\n", __func__);
  llama_backend_init();
@@ -191,6 +201,7 @@
  llama_model * model;
  llama_context * ctx;
  llama_context * ctx_guidance = NULL;
+ std::vector<llama_chat_msg> chat_msgs;
  g_model = &model;
  g_ctx = &ctx;

@@ -216,6 +227,15 @@
  __func__, n_ctx_train, n_ctx);
  }

+ // print chat template example in conversation mode
+ if (params.conversation) {
+ if (params.enable_chat_template) {
+ LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+ } else {
+ LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
+ }
+ }
+
  // print system information
  {
  LOG_TEE("\n");
@@ -245,29 +265,38 @@
  }

  const bool add_bos = llama_should_add_bos_token(model);
- GGML_ASSERT(llama_add_eos_token(model) != 1);
+ if (!llama_model_has_encoder(model)) {
+ GGML_ASSERT(llama_add_eos_token(model) != 1);
+ }
  LOG("add_bos: %d\n", add_bos);

  std::vector<llama_token> embd_inp;

- if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
- LOG("tokenize the prompt\n");
- if (params.chatml) {
- params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
+ {
+ auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
+ ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
+ : params.prompt;
+ if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
+ LOG("tokenize the prompt\n");
+ embd_inp = ::llama_tokenize(ctx, prompt, true, true);
+ } else {
+ LOG("use session tokens\n");
+ embd_inp = session_tokens;
  }
- embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
- } else {
- LOG("use session tokens\n");
- embd_inp = session_tokens;
- }

- LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
- LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG("prompt: \"%s\"\n", log_tostr(prompt));
+ LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ }

  // Should not run without any tokens
  if (embd_inp.empty()) {
- embd_inp.push_back(llama_token_bos(model));
- LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ if (add_bos) {
+ embd_inp.push_back(llama_token_bos(model));
+ LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ } else {
+ LOG_TEE("error: input is empty\n");
+ return -1;
+ }
  }

  // Tokenize negative prompt
@@ -332,37 +361,13 @@
  }

  // number of tokens to keep when resetting context
- if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
+ if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
  params.n_keep = (int)embd_inp.size();
  } else {
  params.n_keep += add_bos; // always keep the BOS token
  }

- // prefix & suffix for instruct mode
- const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
- const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
-
- LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
- LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
-
- // chatml prefix & suffix
- const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
- const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
-
- LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
- LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
-
- // in instruct mode, we inject a prefix and a suffix to each input by the user
- if (params.instruct) {
- params.interactive_first = true;
- params.antiprompt.emplace_back("### Instruction:\n\n");
- }
- // similar for chatml mode
- else if (params.chatml) {
- params.interactive_first = true;
- params.antiprompt.emplace_back("<|im_start|>user\n");
- }
- else if (params.conversation) {
+ if (params.conversation) {
  params.interactive_first = true;
  }

@@ -506,6 +511,7 @@
  std::vector<int> input_tokens; g_input_tokens = &input_tokens;
  std::vector<int> output_tokens; g_output_tokens = &output_tokens;
  std::ostringstream output_ss; g_output_ss = &output_ss;
+ std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode

  // the first thing we will do is to output the prompt, so set color accordingly
  console::set_display(console::prompt);
@@ -528,6 +534,24 @@
  exit(1);
  }

+ if (llama_model_has_encoder(model)) {
+ int enc_input_size = embd_inp.size();
+ llama_token * enc_input_buf = embd_inp.data();
+
+ if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
+ LOG_TEE("%s : failed to eval\n", __func__);
+ return 1;
+ }
+
+ llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+ if (decoder_start_token_id == -1) {
+ decoder_start_token_id = llama_token_bos(model);
+ }
+
+ embd_inp.clear();
+ embd_inp.push_back(decoder_start_token_id);
+ }
+
  while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
  // predict
  if (!embd.empty()) {
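The encoder block added above is what lets llama-cli drive encoder-decoder models (T5-style architectures): the whole prompt is pushed through llama_encode() once, and decoding then starts from the model's decoder start token. A minimal sketch of that same flow under the API used in the diff; the helper name and surrounding variables are illustrative, not taken from the source:

    // Hypothetical helper mirroring the encoder bootstrap added to main.cpp.
    // Assumes a loaded `model`/`ctx` pair and an already tokenized prompt.
    static bool bootstrap_encoder(llama_model * model, llama_context * ctx,
                                  std::vector<llama_token> & prompt_tokens) {
        if (!llama_model_has_encoder(model)) {
            return true; // decoder-only model: nothing to do
        }
        // run the full prompt through the encoder once
        if (llama_encode(ctx, llama_batch_get_one(prompt_tokens.data(),
                                                  (int32_t) prompt_tokens.size(), 0, 0))) {
            return false; // encoding failed
        }
        // generation then starts from the decoder start token (fall back to BOS)
        llama_token start = llama_model_decoder_start_token(model);
        if (start == -1) {
            start = llama_token_bos(model);
        }
        prompt_tokens.clear();
        prompt_tokens.push_back(start);
        return true;
    }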
@@ -821,17 +845,24 @@
  is_antiprompt = true;
  }

+ if (params.enable_chat_template) {
+ chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
+ }
  is_interacting = true;
  printf("\n");
- } else if (params.instruct || params.chatml) {
- is_interacting = true;
  }
  }

+ // if current token is not EOG, we add it to current assistant message
+ if (params.conversation) {
+ auto id = llama_sampling_last(ctx_sampling);
+ assistant_ss << llama_token_to_piece(ctx, id, false);
+ }
+
  if (n_past > 0 && is_interacting) {
  LOG("waiting for user input\n");

- if (params.conversation || params.instruct || params.chatml) {
+ if (params.conversation) {
  printf("\n> ");
  }

@@ -874,49 +905,41 @@ int main(int argc, char ** argv) {

  const size_t original_size = embd_inp.size();

- // instruct mode: insert instruction prefix
- if (params.instruct && !is_antiprompt) {
- LOG("inserting instruction prefix\n");
- n_consumed = embd_inp.size();
- embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
- }
- // chatml mode: insert user chat prefix
- if (params.chatml && !is_antiprompt) {
- LOG("inserting chatml prefix\n");
- n_consumed = embd_inp.size();
- embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
- }
  if (params.escape) {
  string_process_escapes(buffer);
  }

+ bool format_chat = params.conversation && params.enable_chat_template;
+ std::string user_inp = format_chat
+ ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
+ : std::move(buffer);
+ // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
  const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
- const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
+ const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
  const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

  LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

+ // if user stop generation mid-way, we must add EOT to finish model's last response
+ if (need_insert_eot && format_chat) {
+ llama_token eot = llama_token_eot(model);
+ embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot);
+ need_insert_eot = false;
+ }
+
  embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
  embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
  embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());

- // instruct mode: insert response suffix
- if (params.instruct) {
- LOG("inserting instruction suffix\n");
- embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
- }
- // chatml mode: insert assistant chat suffix
- if (params.chatml) {
- LOG("inserting chatml suffix\n");
- embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
- }
-
  for (size_t i = original_size; i < embd_inp.size(); ++i) {
  const llama_token token = embd_inp[i];
  output_tokens.push_back(token);
  output_ss << llama_token_to_piece(ctx, token);
  }

+ // reset assistant message
+ assistant_ss.str("");
+
  n_remain -= line_inp.size();
  LOG("n_remain: %d\n", n_remain);
  } else {
@@ -935,7 +958,7 @@ int main(int argc, char ** argv) {
  }

  // end of generation
- if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) {
+ if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
  LOG_TEE(" [end of text]\n");
  break;
  }
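Taken together, the main.cpp changes drop the hard-coded --instruct/--chatml prompt wrappers and route conversation mode through the model's own chat template via the chat_add_and_format() helper shown above. A rough sketch of the same incremental-formatting idea, assuming the llama_chat_msg/llama_chat_format_single declarations from common.h that this revision relies on (the wrapper name below is illustrative):

    #include "common.h"   // llama_chat_msg, llama_chat_format_single
    #include <string>
    #include <vector>

    // Returns only the newly formatted suffix for the added message, so text that
    // was already formatted and tokenized never has to be processed again.
    static std::string add_and_format(llama_model * model, const std::string & tmpl,
                                      std::vector<llama_chat_msg> & history,
                                      const std::string & role, const std::string & content) {
        llama_chat_msg msg{role, content};
        // add_ass == true asks the template to open an assistant turn after a user message
        std::string delta = llama_chat_format_single(model, tmpl, history, msg,
                                                     /*add_ass=*/ role == "user");
        history.push_back(msg);
        return delta;
    }

The returned delta is what main.cpp then tokenizes with special tokens enabled (format_chat == true), which is why the old inp_pfx/cml_pfx token constants are no longer needed.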
package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt

@@ -1,12 +1,12 @@
  cmake_minimum_required(VERSION 3.12)
- project("main-cmake-pkg" C CXX)
- set(TARGET main-cmake-pkg)
+ project("llama-cli-cmake-pkg" C CXX)
+ set(TARGET llama-cli-cmake-pkg)

  find_package(Llama 0.0.1 REQUIRED)

  # Bake common functionality in with target. Because applications
  # using the relocatable Llama package should be outside of the
- # source tree, main-cmake-pkg pretends the dependencies are built-in.
+ # source tree, llama-cli-cmake-pkg pretends the dependencies are built-in.
  set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
  add_library(common OBJECT)
  file(GLOB _common_files
@@ -15,7 +15,7 @@ file(GLOB _common_files
  )
  target_sources(common PRIVATE ${_common_files})

- # If the common project was part of "main-cmake-pkg" the transient
+ # If the common project was part of "llama-cli-cmake-pkg" the transient
  # defines would automatically be attached. Because the common func-
  # tionality is separate, but dependent upon the defines, it must be
  # explicitly extracted from the "llama" target.
@@ -30,4 +30,3 @@ target_include_directories(${TARGET} PRIVATE ${_common_path})
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
  target_compile_features(${TARGET} PRIVATE cxx_std_11)
-
package/src/llama.cpp/examples/parallel/CMakeLists.txt

@@ -1,4 +1,4 @@
- set(TARGET parallel)
+ set(TARGET llama-parallel)
  add_executable(${TARGET} parallel.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
package/src/llama.cpp/examples/parallel/parallel.cpp

@@ -100,7 +100,8 @@ int main(int argc, char ** argv) {

  gpt_params params;

- if (gpt_params_parse(argc, argv, params) == false) {
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
  return 1;
  }

package/src/llama.cpp/examples/passkey/CMakeLists.txt

@@ -1,4 +1,4 @@
- set(TARGET passkey)
+ set(TARGET llama-passkey)
  add_executable(${TARGET} passkey.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
package/src/llama.cpp/examples/passkey/passkey.cpp

@@ -6,46 +6,32 @@
  #include <string>
  #include <vector>

- int main(int argc, char ** argv) {
- gpt_params params;
-
- if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]);
- return 1 ;
- }
-
- int seed = -1;
+ static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);

- int n_junk = 250; // number of times to repeat the junk text
- int n_keep = 32; // number of tokens in the prompt prefix
- int n_grp = 1; // if more than 1 - perform LongLM SelfExtend
- int i_pos = -1; // position of the passkey in the junk text
-
- if (argc >= 2) {
- params.model = argv[1];
- }
-
- if (argc >= 3) {
- n_junk = std::stoi(argv[2]);
- }
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+ LOG_TEE("\n");
+ }

- if (argc >= 4) {
- n_grp = std::stoi(argv[3]);
- }
+ int main(int argc, char ** argv) {
+ gpt_params params;

- if (argc >= 5) {
- i_pos = std::stoi(argv[4]);
- }
+ params.n_junk = 250;
+ params.n_keep = 32;
+ params.i_pos = -1;

- if (argc >= 6) {
- seed = std::stoi(argv[5]);
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
  }

- if (seed == -1) {
- seed = time(NULL);
- }
+ srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);

- srand(seed);
+ int n_junk = params.n_junk;
+ int n_keep = params.n_keep;
+ int n_grp = params.grp_attn_n;
+ int i_pos = params.i_pos;

  if (i_pos == -1) {
  i_pos = rand() % n_junk;
@@ -76,9 +62,7 @@ int main(int argc, char ** argv) {

  // initialize the model

- llama_model_params model_params = llama_model_default_params();
-
- model_params.n_gpu_layers = 99; // offload all layers to the GPU
+ llama_model_params model_params = llama_model_params_from_gpt_params(params);

  llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -89,13 +73,9 @@ int main(int argc, char ** argv) {

  // initialize the context

- llama_context_params ctx_params = llama_context_default_params();
+ llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

- ctx_params.seed = seed;
- ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
- ctx_params.n_batch = 512;
- ctx_params.n_threads = params.n_threads;
- ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;

  GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");

@@ -135,7 +115,7 @@ int main(int argc, char ** argv) {
  LOG_TEE("prompt tokens: %d\n", n_tokens_all);
  //LOG_TEE("prompt: %s\n", params.prompt.c_str());

- llama_batch batch = llama_batch_init(512, 0, 1);
+ llama_batch batch = llama_batch_init(params.n_batch, 0, 1);

  int n_past = 0;
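The passkey rewrite is typical of this release: examples no longer parse positional arguments and fill llama_model_params/llama_context_params by hand, they derive both from gpt_params. A minimal sketch of that pattern, assuming the common.h helpers shown in the diff (error handling trimmed; the context-size override is the passkey-specific part):

    #include "common.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {
            gpt_params_print_usage(argc, argv, params);
            return 1;
        }

        llama_backend_init();

        // model/context parameters now come from gpt_params instead of manual fields
        llama_model_params mparams = llama_model_params_from_gpt_params(params);
        llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);

        llama_context_params cparams = llama_context_params_from_gpt_params(params);
        cparams.n_ctx = llama_n_ctx_train(model) * params.grp_attn_n + params.n_keep; // passkey-style override
        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... run the example ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }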
package/src/llama.cpp/examples/perplexity/CMakeLists.txt

@@ -1,4 +1,4 @@
- set(TARGET perplexity)
+ set(TARGET llama-perplexity)
  add_executable(${TARGET} perplexity.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
package/src/llama.cpp/examples/perplexity/perplexity.cpp

@@ -476,7 +476,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
  }

  // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
- // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+ // Run `./llama-perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
  // Output: `perplexity: 13.5106 [114/114]`
  // BOS tokens will be added for each chunk before eval

@@ -1032,7 +1032,7 @@ struct winogrande_entry {
  std::vector<llama_token> seq_tokens[2];
  };

- static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
+ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string & prompt) {
  std::vector<winogrande_entry> result;
  std::istringstream in(prompt);
  std::string line;
@@ -1964,12 +1964,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  int main(int argc, char ** argv) {
  gpt_params params;

+ params.n_ctx = 512;
+ params.logits_all = true;
+
  if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
  return 1;
  }

- params.logits_all = true;
-
  const int32_t n_ctx = params.n_ctx;

  if (n_ctx <= 0) {
@@ -1989,6 +1991,12 @@ int main(int argc, char ** argv) {
  params.n_batch = std::min(params.n_batch, n_kv);
  } else {
  params.n_batch = std::min(params.n_batch, params.n_ctx);
+ if (params.kl_divergence) {
+ params.n_parallel = 1;
+ } else {
+ // ensure there's at least enough seq_ids for HellaSwag
+ params.n_parallel = std::max(4, params.n_parallel);
+ }
  }

  if (params.ppl_stride > 0) {
@@ -2006,9 +2014,6 @@ int main(int argc, char ** argv) {
  fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

  std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = string_random_prompt(rng);
- }

  llama_backend_init();
  llama_numa_init(params.numa);
@@ -2016,9 +2021,6 @@ int main(int argc, char ** argv) {
  llama_model * model;
  llama_context * ctx;

- // ensure there's at least enough seq_ids for HellaSwag
- params.n_parallel = std::max(4, params.n_parallel);
-
  // load the model and apply lora adapter, if any
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
  if (model == NULL) {
@@ -2027,6 +2029,7 @@ int main(int argc, char ** argv) {
  }

  const int n_ctx_train = llama_n_ctx_train(model);
+
  if (params.n_ctx > n_ctx_train) {
  fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
  __func__, n_ctx_train, params.n_ctx);
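The perplexity changes follow the convention now used across the examples: example-specific defaults are written into gpt_params before gpt_params_parse(), so command-line flags can still override them, and usage is printed when parsing fails. Roughly, as a hedged sketch of that pattern only:

    #include "common.h"

    int main(int argc, char ** argv) {
        gpt_params params;

        // defaults specific to this example, set before parsing so CLI flags win
        params.n_ctx      = 512;
        params.logits_all = true;

        if (!gpt_params_parse(argc, argv, params)) {
            gpt_params_print_usage(argc, argv, params);
            return 1;
        }
        // ... continue with params ...
        return 0;
    }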
package/src/llama.cpp/examples/quantize/CMakeLists.txt

@@ -1,4 +1,4 @@
- set(TARGET quantize)
+ set(TARGET llama-quantize)
  add_executable(${TARGET} quantize.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
package/src/llama.cpp/examples/quantize/quantize.cpp

@@ -16,41 +16,44 @@ struct quant_option {
  };

  static const std::vector<struct quant_option> QUANT_OPTIONS = {
- { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
- { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
- { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
- { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
- { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
- { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
- { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
- { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
- { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
- { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
- { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
- { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
- { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
- { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
- { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
- { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
- { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , },
- { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
- { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
- { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
- { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
- { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
- { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
- { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
- { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
- { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
- { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
- { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
- { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
- { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
- { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", },
- { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
- { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
+ { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+ { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
+ { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
+ { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
+ { "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
+ { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
+ { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
+ { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
+ { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
+ { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
+ { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
+ { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
+ { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
+ { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
+ { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
+ { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+ { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
+ { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
+ { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
+ { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
+ { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
+ { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
+ { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
+ { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
+ { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
+ { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
+ { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
+ { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
+ { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
+ { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
+ { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+ { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+ { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+ { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
+ { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
+ { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
  // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
- { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
+ { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
  };

  static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt

@@ -1,4 +1,4 @@
- set(TARGET quantize-stats)
+ set(TARGET llama-quantize-stats)
  add_executable(${TARGET} quantize-stats.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})