@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0

package/src/llama.cpp/examples/gguf-split/gguf-split.cpp

@@ -61,10 +61,10 @@ static size_t split_str_to_n_bytes(std::string str) {
     int n;
     if (str.back() == 'M') {
         sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1024 * 1024; // megabytes
+        n_bytes = (size_t)n * 1000 * 1000; // megabytes
     } else if (str.back() == 'G') {
         sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
+        n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
     } else {
         throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
     }
@@ -284,7 +284,7 @@ struct split_strategy {
             struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
             total_size += ggml_nbytes(t);
         }
-        total_size = total_size / 1024 / 1024; // convert to megabytes
+        total_size = total_size / 1000 / 1000; // convert to megabytes
         printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
         i_split++;
     }
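
Note: both gguf-split hunks above switch the size arithmetic from binary (1024-based) to decimal (1000-based) units while keeping the M/G suffixes. A minimal standalone sketch of the resulting behaviour follows; parse_split_size is a hypothetical helper written for illustration, not the upstream split_str_to_n_bytes:

    // Illustrative only: mirrors the decimal (SI) unit handling shown in the hunks above.
    #include <cstdio>
    #include <stdexcept>
    #include <string>

    static size_t parse_split_size(const std::string & str) {
        int n = 0;
        size_t n_bytes = 0;
        if (str.back() == 'M') {
            sscanf(str.c_str(), "%d", &n);
            n_bytes = (size_t)n * 1000 * 1000;        // 300M -> 300,000,000 bytes
        } else if (str.back() == 'G') {
            sscanf(str.c_str(), "%d", &n);
            n_bytes = (size_t)n * 1000 * 1000 * 1000; // 2G -> 2,000,000,000 bytes
        } else {
            throw std::invalid_argument("supported units are M or G, got: " + std::string(1, str.back()));
        }
        return n_bytes;
    }

    int main() {
        printf("300M -> %zu bytes\n", parse_split_size("300M"));
        printf("2G   -> %zu bytes\n", parse_split_size("2G"));
        return 0;
    }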

package/src/llama.cpp/examples/gritlm/CMakeLists.txt

@@ -1,4 +1,4 @@
-set(TARGET gritlm)
+set(TARGET llama-gritlm)
 add_executable(${TARGET} gritlm.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

package/src/llama.cpp/examples/gritlm/gritlm.cpp

@@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

         // clear previous kv_cache values (irrelevant for embeddings)
         llama_kv_cache_clear(ctx);
+        llama_set_embeddings(ctx, true);
         llama_set_causal_attn(ctx, false);

         // run model
@@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
     llama_token eos_token = llama_token_eos(mdl);

     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
+
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

     std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
@@ -153,7 +156,9 @@ static std::string gritlm_instruction(const std::string & instruction) {

 int main(int argc, char * argv[]) {
     gpt_params params;
+
     if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
         return 1;
     }

@@ -164,8 +169,7 @@ int main(int argc, char * argv[]) {

     llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);

-    // create new context - set to embedding mode
-    cparams.embeddings = true;
+    // create generation context
     llama_context * ctx = llama_new_context_with_model(mdl, cparams);

     // ### Embedding/Representation ###
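
Note: taken together, the gritlm.cpp hunks stop creating the context in embedding mode (cparams.embeddings = true) and instead switch a single context between phases at run time. A minimal sketch of that toggle pattern, assuming only the llama.h calls that appear in this diff (llama_kv_cache_clear, llama_set_embeddings, llama_set_causal_attn); the two helpers are hypothetical stand-ins for the example's encode() and generate():

    #include "llama.h"

    // Hypothetical helper: configure the shared context for the embedding pass.
    static void prepare_embedding_pass(llama_context * ctx) {
        llama_kv_cache_clear(ctx);          // previous cache is irrelevant for embeddings
        llama_set_embeddings(ctx, true);    // ask the context for embeddings instead of logits
        llama_set_causal_attn(ctx, false);  // bidirectional attention for representation
        // ... decode the prompt batch and read the embeddings here ...
    }

    // Hypothetical helper: switch the same context back for text generation.
    static void prepare_generation_pass(llama_context * ctx) {
        llama_kv_cache_clear(ctx);
        llama_set_embeddings(ctx, false);   // back to producing logits
        llama_set_causal_attn(ctx, true);   // causal attention for autoregressive decoding
        // ... build a batch and sample tokens as in generate() ...
    }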

package/src/llama.cpp/examples/imatrix/CMakeLists.txt

@@ -1,4 +1,4 @@
-set(TARGET imatrix)
+set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

package/src/llama.cpp/examples/imatrix/imatrix.cpp

@@ -17,39 +17,37 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n %s \\\n"
+        " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+        " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
+        " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
+    LOG_TEE("\n");
+}
+
 struct Stats {
     std::vector<float> values;
     std::vector<int> counts;
     int ncall = 0;
 };

-struct StatParams {
-    std::string dataset;
-    std::string ofile = "imatrix.dat";
-    int n_output_frequency = 10;
-    int verbosity = 1;
-    int keep_every = 0;
-    bool collect_output_weight = false;
-};
-
 class IMatrixCollector {
 public:
     IMatrixCollector() = default;
-    void set_parameters(StatParams&& params) { m_params = std::move(params); }
+    void set_params(gpt_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix() const;
-    bool load_imatrix(const char * file_name, bool add);
-    static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
+    void save_imatrix(int ncall = -1) const;
+    bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
-    StatParams m_params;
+    gpt_params m_params;
     std::mutex m_mutex;
     int m_last_call = 0;
     std::vector<float> m_src1_data;
     std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
-    //
-    void save_imatrix(const char * file_name, const char * dataset) const;
-    void keep_imatrix(int ncall) const;
 };

 // remove any prefix and suffixes from the name
@@ -85,7 +83,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         if (t->op != GGML_OP_MUL_MAT) return false;
         // why are small batches ignored (<16 tokens)?
         if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
-        if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
+        if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
         return true;
     }

@@ -129,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         }
         else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
-            exit(1); //GGML_ASSERT(false);
+            exit(1); //GGML_ABORT("fatal error");
         }
         if (m_params.verbosity > 1) {
             printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
@@ -153,28 +151,32 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                     for (int j = 0; j < (int)src1->ne[0]; ++j) {
                         e.values[e_start + j] += x[j]*x[j];
                         e.counts[e_start + j]++;
+                        if (!std::isfinite(e.values[e_start + j])) {
+                            fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
+                            exit(1);
+                        }
                     }
                 }
             }
             if (e.ncall > m_last_call) {
                 m_last_call = e.ncall;
-                if (m_last_call % m_params.n_output_frequency == 0) {
+                if (m_last_call % m_params.n_out_freq == 0) {
                     save_imatrix();
                 }
-                if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
-                    keep_imatrix(m_last_call);
+                if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                    save_imatrix(m_last_call);
                 }
             }
         }
     } else {
-        auto& e = m_stats[wname];
+        auto & e = m_stats[wname];
         if (e.values.empty()) {
             e.values.resize(src1->ne[0], 0);
             e.counts.resize(src1->ne[0], 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
-            exit(1); //GGML_ASSERT(false);
+            exit(1); //GGML_ABORT("fatal error");
         }
         ++e.ncall;
         if (m_params.verbosity > 1) {
@@ -185,15 +187,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             for (int j = 0; j < (int)src1->ne[0]; ++j) {
                 e.values[j] += x[j]*x[j];
                 e.counts[j]++;
+                if (!std::isfinite(e.values[j])) {
+                    fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
+                    exit(1);
+                }
             }
         }
         if (e.ncall > m_last_call) {
             m_last_call = e.ncall;
-            if (m_last_call % m_params.n_output_frequency == 0) {
+            if (m_last_call % m_params.n_out_freq == 0) {
                 save_imatrix();
             }
-            if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
-                keep_imatrix(m_last_call);
+            if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                save_imatrix(m_last_call);
             }
         }
     }
@@ -201,33 +207,75 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     return true;
 }

-void IMatrixCollector::save_imatrix() const {
-    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
-}
+void IMatrixCollector::save_imatrix(int ncall) const {
+    auto fname = m_params.out_file;
+    if (fname.empty()) {
+        fname = "imatrix.dat";
+    }

-void IMatrixCollector::keep_imatrix(int ncall) const {
-    auto file_name = m_params.ofile;
-    if (file_name.empty()) file_name = "imatrix.dat";
-    file_name += ".at_";
-    file_name += std::to_string(ncall);
-    save_imatrix(file_name.c_str(), m_params.dataset.c_str());
-}
+    if (ncall > 0) {
+        fname += ".at_";
+        fname += std::to_string(ncall);
+    }
+
+    // avoid writing imatrix entries that do not have full data
+    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data
+
+    int n_entries = 0;
+    std::vector<std::string> to_store;
+
+    bool is_first = true; // for printing
+    for (const auto & kv : m_stats) {
+        const int n_all = kv.second.counts.size();
+
+        if (n_all == 0) {
+            continue;
+        }
+
+        int n_zeros = 0;
+        for (const int c : kv.second.counts) {
+            if (c == 0) {
+                n_zeros++;
+            }
+        }
+
+        if (n_zeros != 0 && is_first) {
+            fprintf(stderr, "\n");
+            is_first = false;
+        }
+
+        if (n_zeros == n_all) {
+            fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
+            continue;
+        }
+
+        if (n_zeros > 0) {
+            fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
+            continue;
+        }
+
+        n_entries++;
+        to_store.push_back(kv.first);
+    }
+
+    if (to_store.size() < m_stats.size()) {
+        fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
+    }

-void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
     std::ofstream out(fname, std::ios::binary);
-    int n_entries = m_stats.size();
     out.write((const char *) &n_entries, sizeof(n_entries));
-    for (const auto & p : m_stats) {
-        int len = p.first.size();
+    for (const auto & name : to_store) {
+        const auto & stat = m_stats.at(name);
+        int len = name.size();
         out.write((const char *) &len, sizeof(len));
-        out.write(p.first.c_str(), len);
-        out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
-        int nval = p.second.values.size();
+        out.write(name.c_str(), len);
+        out.write((const char *) &stat.ncall, sizeof(stat.ncall));
+        int nval = stat.values.size();
         out.write((const char *) &nval, sizeof(nval));
         if (nval > 0) {
             std::vector<float> tmp(nval);
             for (int i = 0; i < nval; i++) {
-                tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
+                tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
             }
             out.write((const char*)tmp.data(), nval*sizeof(float));
         }
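
Note: the new save_imatrix(int ncall) above folds the old keep_imatrix() into an optional argument and, before writing, drops entries whose per-column counts are incomplete (for example MoE experts never exercised by the calibration data). A standalone sketch of that coverage check, using an illustrative Entry type rather than the upstream Stats map:

    // Illustrative only: an entry is stored when every column has at least one observation.
    #include <cstdio>
    #include <string>
    #include <vector>

    struct Entry {
        std::string name;
        std::vector<int> counts; // per-column observation counts
    };

    static bool should_store(const Entry & e) {
        int n_zeros = 0;
        for (int c : e.counts) {
            if (c == 0) n_zeros++;
        }
        const int n_all = (int) e.counts.size();
        if (n_all == 0 || n_zeros == n_all) {
            fprintf(stderr, "entry '%s' has no data - skipping\n", e.name.c_str());
            return false;
        }
        if (n_zeros > 0) {
            fprintf(stderr, "entry '%s' has partial data (%.2f%%) - skipping\n",
                    e.name.c_str(), 100.0f * (n_all - n_zeros) / n_all);
            return false;
        }
        return true;
    }

    int main() {
        printf("%d\n", should_store({"blk.0.ffn_up.weight", {3, 3, 3}}));        // 1: full data, stored
        printf("%d\n", should_store({"blk.1.ffn_gate_exps.weight", {3, 0, 3}})); // 0: partial data, skipped
        return 0;
    }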
@@ -236,26 +284,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
     // Write the number of call the matrix was computed with
     out.write((const char *) &m_last_call, sizeof(m_last_call));

-    // Write the dataset name at the end of the file to later on specify it in quantize
-    int n_dataset = strlen(dataset);
-    out.write((const char *) &n_dataset, sizeof(n_dataset));
-    out.write(dataset, n_dataset);
+    // Write the input filename at the end of the file to later on specify it in quantize
+    {
+        int len = m_params.prompt_file.size();
+        out.write((const char *) &len, sizeof(len));
+        out.write(m_params.prompt_file.c_str(), len);
+    }

     if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
     }
 }

-bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
-    std::ifstream in(imatrix_file, std::ios::binary);
+bool IMatrixCollector::load_imatrix(const char * fname) {
+    std::ifstream in(fname, std::ios::binary);
     if (!in) {
-        printf("%s: failed to open %s\n",__func__,imatrix_file);
+        printf("%s: failed to open %s\n",__func__, fname);
         return false;
     }
     int n_entries;
     in.read((char*)&n_entries, sizeof(n_entries));
     if (in.fail() || n_entries < 1) {
-        printf("%s: no data in file %s\n", __func__, imatrix_file);
+        printf("%s: no data in file %s\n", __func__, fname);
         return false;
     }
     for (int i = 0; i < n_entries; ++i) {
@@ -263,23 +313,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
         std::vector<char> name_as_vec(len+1);
         in.read((char *)name_as_vec.data(), len);
         if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
+            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
             return false;
         }
         name_as_vec[len] = 0;
         std::string name{name_as_vec.data()};
-        auto& e = imatrix_data[std::move(name)];
+        auto & e = m_stats[std::move(name)];
         int ncall;
         in.read((char*)&ncall, sizeof(ncall));
         int nval;
         in.read((char *)&nval, sizeof(nval));
         if (in.fail() || nval < 1) {
             printf("%s: failed reading number of values for entry %d\n",__func__,i);
-            imatrix_data = {};
+            m_stats = {};
             return false;
         }

-        // When re-called from load_imatrix() with add set, this will already be created.
         if (e.values.empty()) {
             e.values.resize(nval, 0);
             e.counts.resize(nval, 0);
@@ -289,7 +338,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
         in.read((char*)tmp.data(), nval*sizeof(float));
         if (in.fail()) {
             printf("%s: failed reading data for entry %d\n",__func__,i);
-            imatrix_data = {};
+            m_stats = {};
             return false;
         }

@@ -304,13 +353,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
     return true;
 }

-bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
-    if (!add) {
-        m_stats.clear();
-    }
-    return load_imatrix(file_name, m_stats);
-}
-
 static IMatrixCollector g_collector;

 static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -324,7 +366,7 @@ struct results_log_softmax {
     float prob;
 };

-static std::vector<float> softmax(const std::vector<float>& logits) {
+static std::vector<float> softmax(const std::vector<float> & logits) {
     std::vector<float> probs(logits.size());
     float max_logit = logits[0];
     for (float v : logits) {
@@ -358,8 +400,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to

 static void process_logits(
     int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
-    double & nll, double & nll2, float * logit_history, float * prob_history
-) {
+    double & nll, double & nll2, float * logit_history, float * prob_history) {
     std::mutex mutex;
     int counter = 0;
     auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@@ -391,8 +432,7 @@ static void process_logits(
     }
 }

-static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
-
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
     GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
     const int n_ctx = llama_n_ctx(ctx);
@@ -405,13 +445,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

-    if (from_chunk > 0) {
-        if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
-            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
+    if (params.i_chunk > 0) {
+        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
+            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
             return false;
         }
-        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
-        tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
+        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
     }

     if (int(tokens.size()) < 2*n_ctx) {
@@ -424,7 +464,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     std::vector<float> logit_history;
     std::vector<float> prob_history;

-    if (compute_ppl) {
+    if (params.compute_ppl) {
         logit_history.resize(tokens.size());
         prob_history.resize(tokens.size());
     }
@@ -446,7 +486,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     const int num_batches = (n_ctx + n_batch - 1) / n_batch;

     std::vector<float> logits;
-    if (compute_ppl && num_batches > 1) {
+    if (params.compute_ppl && num_batches > 1) {
         logits.reserve((size_t)n_ctx * n_vocab);
     }

@@ -482,7 +522,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
             // restore the original token in case it was set to BOS
             tokens[batch_start] = token_org;

-            if (compute_ppl && num_batches > 1) {
+            if (params.compute_ppl && num_batches > 1) {
                 const auto * batch_logits = llama_get_logits(ctx);
                 logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
             }
@@ -501,7 +541,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
             fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
         }

-        if (compute_ppl) {
+        if (params.compute_ppl) {
             const int first = n_ctx/2;
             const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
             process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
@@ -516,7 +556,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     }
     printf("\n");

-    if (compute_ppl) {
+    if (params.compute_ppl) {
         nll2 /= count;
         nll /= count;
         const double ppl = exp(nll);
@@ -533,111 +573,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
 }

 int main(int argc, char ** argv) {
-
-    StatParams sparams;
-    std::string prev_result_file;
-    std::string combine_files;
-    bool compute_ppl = true;
-    int from_chunk = 0;
-    std::vector<char*> args;
-    args.push_back(argv[0]);
-    int iarg = 1;
-    for (; iarg < argc-1; ++iarg) {
-        std::string arg{argv[iarg]};
-        if (arg == "-o" || arg == "--output-file") {
-            sparams.ofile = argv[++iarg];
-        }
-        else if (arg == "-ofreq" || arg == "--output-frequency") {
-            sparams.n_output_frequency = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "-ow" || arg == "--output-weight") {
-            sparams.collect_output_weight = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "--verbosity") {
-            sparams.verbosity = std::stoi(argv[++iarg]);
-        } else if (arg == "--no-ppl") {
-            compute_ppl = false;
-        } else if (arg == "--keep-imatrix") {
-            sparams.keep_every = std::stoi(argv[++iarg]);
-        } else if (arg == "--continue-from") {
-            prev_result_file = argv[++iarg];
-        } else if (arg == "--combine") {
-            combine_files = argv[++iarg];
-        }
-        else if (arg == "--from-chunk") {
-            from_chunk = std::stoi(argv[++iarg]);
-        } else {
-            args.push_back(argv[iarg]);
-        }
-    }
-    if (iarg < argc) {
-        std::string arg{argv[iarg]};
-        if (arg == "--no-ppl") {
-            compute_ppl = false;
-        } else {
-            args.push_back(argv[iarg]);
-        }
-    }
-
     gpt_params params;
-    params.n_batch = 512;
-    if (!gpt_params_parse(args.size(), args.data(), params)) {
-        return 1;
-    }

+    params.n_ctx = 512;
     params.logits_all = true;
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
+    params.verbosity = 1;

-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
+        return 1;
     }

-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
+    params.n_batch = std::min(params.n_batch, params.n_ctx);

-    sparams.dataset = params.prompt_file;
-    g_collector.set_parameters(std::move(sparams));
+    g_collector.set_params(params);

-    if (!combine_files.empty()) {
-        std::vector<std::string> files;
-        size_t pos = 0;
-        while (true) {
-            auto new_pos = combine_files.find(',', pos);
-            if (new_pos != std::string::npos) {
-                files.emplace_back(combine_files.substr(pos, new_pos - pos));
-                pos = new_pos + 1;
-            } else {
-                files.emplace_back(combine_files.substr(pos));
-                break;
-            }
-        }
-        if (files.size() < 2) {
-            fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
+    for (const auto & in_file : params.in_files) {
+        printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
+        if (!g_collector.load_imatrix(in_file.c_str())) {
+            fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
             return 1;
         }
-        printf("Combining the following %d files\n", int(files.size()));
-        for (auto& file : files) {
-            printf(" %s\n", file.c_str());
-            if (!g_collector.load_imatrix(file.c_str(), true)) {
-                fprintf(stderr, "Failed to load %s\n", file.c_str());
-                return 1;
-            }
-        }
-        g_collector.save_imatrix();
-        return 0;
     }

-    if (!prev_result_file.empty()) {
-        if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
-            fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
-            return 1;
-        }
+    if (params.in_files.size() > 1) {
+        printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+        g_collector.save_imatrix();
     }

     llama_backend_init();
@@ -652,6 +613,7 @@ int main(int argc, char ** argv) {
     // init
     llama_model * model;
     llama_context * ctx;
+
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);
@@ -670,8 +632,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }

-    bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
-    if (!OK) {
+    if (!compute_imatrix(ctx, params)) {
         return 1;
     }


package/src/llama.cpp/examples/infill/CMakeLists.txt

@@ -1,4 +1,4 @@
-set(TARGET infill)
+set(TARGET llama-infill)
 add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})