@fugood/llama.node 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. package/CMakeLists.txt +6 -3
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +3 -3
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  24. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  25. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  26. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  27. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  28. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  29. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  31. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  32. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  36. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  37. package/src/llama.cpp/CMakeLists.txt +91 -1245
  38. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  39. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  40. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  41. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  42. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  43. package/src/llama.cpp/common/common.cpp +1116 -877
  44. package/src/llama.cpp/common/common.h +191 -77
  45. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  46. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  47. package/src/llama.cpp/common/log.h +1 -1
  48. package/src/llama.cpp/common/ngram-cache.h +10 -3
  49. package/src/llama.cpp/common/sampling.cpp +19 -10
  50. package/src/llama.cpp/docs/build.md +353 -0
  51. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  52. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  54. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  56. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  58. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  60. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  61. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  62. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  63. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  64. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  65. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  66. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  67. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  68. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  69. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  71. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  72. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  73. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  75. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  76. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  77. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  79. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  80. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  87. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  88. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  90. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  91. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  92. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  94. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  95. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  96. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  97. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  98. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  99. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  100. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  102. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  103. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  104. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  105. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  106. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  107. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  108. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  110. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  111. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  112. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  113. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  114. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  115. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  116. package/src/llama.cpp/examples/main/main.cpp +98 -75
  117. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  118. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  119. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  120. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  121. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  122. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  123. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  124. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  125. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  126. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  127. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  128. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  129. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  130. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  131. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  132. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  133. package/src/llama.cpp/examples/server/server.cpp +274 -671
  134. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  135. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  136. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  137. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  138. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  139. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  140. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  141. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  142. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  143. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  144. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  145. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  146. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  147. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  148. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  149. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  150. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  151. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  152. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  153. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  154. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  155. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  156. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  157. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  159. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  160. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  161. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  162. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  163. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  178. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  179. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  180. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  181. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  182. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  183. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  184. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  208. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  209. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  210. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  211. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  212. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  214. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  215. package/src/llama.cpp/models/.editorconfig +1 -0
  216. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  217. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  221. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  230. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  233. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  237. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  249. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  252. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  255. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  258. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  259. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  260. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  261. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  263. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  264. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  265. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  266. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  267. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  268. package/src/llama.cpp/requirements.txt +5 -4
  269. package/src/llama.cpp/scripts/build-info.sh +30 -0
  270. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  271. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  272. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  273. package/src/llama.cpp/src/llama-grammar.h +39 -0
  274. package/src/llama.cpp/src/llama-impl.h +26 -0
  275. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  276. package/src/llama.cpp/src/llama-sampling.h +56 -0
  277. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  278. package/src/llama.cpp/src/llama-vocab.h +130 -0
  279. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  280. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  281. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  282. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  283. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  284. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  285. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  286. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  287. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  289. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  290. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  291. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  292. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  293. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  294. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  295. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  296. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  297. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  298. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  299. package/bin/darwin/arm64/default.metallib +0 -0
  300. package/bin/darwin/x64/default.metallib +0 -0
  301. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  302. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  303. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  304. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  305. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  306. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  307. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  308. package/src/llama.cpp/ggml-opencl.h +0 -36
  309. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  310. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  311. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  314. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  315. package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  316. package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  317. package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  318. package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  319. package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/common/common.h:

@@ -52,109 +52,114 @@ int32_t cpu_get_num_math();
 // CLI argument parsing
 //
 
+// dimensionality reduction methods, used by cvector-generator
+enum dimre_method {
+    DIMRE_METHOD_PCA,
+    DIMRE_METHOD_MEAN,
+};
+
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
     int32_t n_threads = cpu_get_num_math();
-    int32_t n_threads_draft = -1;
-    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
-    int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 512; // context size
-    int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_draft = 5; // number of tokens to draft during speculative decoding
-    int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
-    int32_t n_parallel = 1; // number of parallel sequences to decode
-    int32_t n_sequences = 1; // number of sequences to decode
-    float p_split = 0.1f; // speculative decoding split probability
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_beams = 0; // if non-zero then use beam search of given width.
-    int32_t grp_attn_n = 1; // group-attention factor
-    int32_t grp_attn_w = 512; // group-attention width
-    int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
-    float rope_freq_base = 0.0f; // RoPE base frequency
-    float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+    int32_t n_threads_draft       = -1;
+    int32_t n_threads_batch       = -1;   // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_threads_batch_draft = -1;
+    int32_t n_predict             = -1;   // new tokens to predict
+    int32_t n_ctx                 = 0;    // context size
+    int32_t n_batch               = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_ubatch              = 512;  // physical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                = 0;    // number of tokens to keep from initial prompt
+    int32_t n_draft               = 5;    // number of tokens to draft during speculative decoding
+    int32_t n_chunks              = -1;   // max number of chunks to process (-1 = unlimited)
+    int32_t n_parallel            = 1;    // number of parallel sequences to decode
+    int32_t n_sequences           = 1;    // number of sequences to decode
+    float   p_split               = 0.1f; // speculative decoding split probability
+    int32_t n_gpu_layers          = -1;   // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft    = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
+    int32_t main_gpu              = 0;    // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]     = {0};  // how split tensors should be distributed across GPUs
+    int32_t grp_attn_n            = 1;    // group-attention factor
+    int32_t grp_attn_w            = 512;  // group-attention width
+    int32_t n_print               = -1;   // print token count every n tokens (-1 = disabled)
+    float   rope_freq_base        = 0.0f; // RoPE base frequency
+    float   rope_freq_scale       = 0.0f; // RoPE frequency scaling factor
     float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
-    float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+    float   yarn_attn_factor      = 1.0f; // YaRN magnitude scaling factor
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
-    float yarn_beta_slow = 1.0f; // YaRN high correction dim
-    int32_t yarn_orig_ctx = 0; // YaRN original context length
+    float   yarn_beta_slow        = 1.0f; // YaRN high correction dim
+    int32_t yarn_orig_ctx         = 0;    // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
-    std::string rpc_servers = ""; // comma separated list of RPC servers
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
 
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
+    enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
     // // sampling parameters
     struct llama_sampling_params sparams;
 
-    std::string model = ""; // model path
-    std::string model_draft = ""; // draft model for speculative decoding
+    std::string model       = ""; // model path
+    std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
-    std::string model_url = ""; // model url to download
-    std::string hf_repo = ""; // HF repo
-    std::string hf_file = ""; // HF file
+    std::string model_url   = ""; // model url to download
+    std::string hf_token    = ""; // HF token
+    std::string hf_repo     = ""; // HF repo
+    std::string hf_file     = ""; // HF file
     std::string prompt = "";
-    std::string prompt_file = ""; // store the external prompt file name
-    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
-    std::string input_prefix = ""; // string to prefix user inputs with
-    std::string input_suffix = ""; // string to suffix user inputs with
-    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-    std::string logdir = ""; // directory in which to save YAML log files
+    std::string prompt_file       = ""; // store the external prompt file name
+    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
+    std::string input_prefix      = ""; // string to prefix user inputs with
+    std::string input_suffix      = ""; // string to suffix user inputs with
+    std::string logdir            = ""; // directory in which to save YAML log files
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file = ""; // file for saving *all* logits
+    std::string logits_file       = ""; // file for saving *all* logits
+    std::string rpc_servers       = ""; // comma separated list of RPC servers
 
+    std::vector<std::string> in_files;   // all input files
+    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
     // TODO: avoid tuple, use struct
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
-    std::string lora_base = ""; // base model path for the lora adapter
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
+    int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector
 
-    int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
-    int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
-    // (which is more convenient to use for plotting)
-    //
-    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
-    size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
+    int32_t ppl_stride      = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+                                 // (which is more convenient to use for plotting)
+    //
+    bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
 
-    bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+    bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
 
-    bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
-    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+    bool   multiple_choice       = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
 
-    bool kl_divergence = false; // compute KL divergence
+    bool kl_divergence = false; // compute KL divergence
 
-    bool random_prompt = false; // do not randomize prompt if none provided
+    bool usage = false; // print usage
     bool use_color = false; // use color to distinguish generations and inputs
-    bool interactive = false; // interactive mode
-    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool special = false; // enable special token output
+    bool interactive       = false; // interactive mode
+    bool interactive_first = false; // wait for user input immediately
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
-    bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
 
-    bool embedding = false; // get only sentence embedding
-    bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
-    bool interactive_first = false; // wait for user input immediately
+    bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
@@ -162,7 +167,6 @@ struct gpt_params {
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos = false; // ignore generated EOS tokens
-    bool instruct = false; // instruction mode (used for Alpaca models)
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
@@ -180,8 +184,81 @@ struct gpt_params {
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)
+
+    // embedding
+    bool embedding         = false; // get only sentence embedding
+    int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+    std::string embd_sep   = "\n";  // separator of embendings
+
+    // server params
+    int32_t port           = 8080;         // server listens on this network port
+    int32_t timeout_read   = 600;          // http read timeout in seconds
+    int32_t timeout_write  = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1;           // number of threads to process HTTP requests
+
+    std::string hostname      = "127.0.0.1";
+    std::string public_path   = "";
+    std::string chat_template = "";
+    std::string system_prompt = "";
+    bool enable_chat_template = true;
+
+    std::vector<std::string> api_keys;
+
+    std::string ssl_file_key  = "";
+    std::string ssl_file_cert = "";
+
+    bool endpoint_slots   = true;
+    bool endpoint_metrics = false;
+
+    bool log_json = false;
+
+    std::string slot_save_path;
+
+    float slot_prompt_similarity = 0.5f;
+
+    // batched-bench params
+    bool is_pp_shared = false;
+
+    std::vector<int32_t> n_pp;
+    std::vector<int32_t> n_tg;
+    std::vector<int32_t> n_pl;
+
+    // retrieval params
+    std::vector<std::string> context_files; // context files to embed
+
+    int32_t chunk_size = 64; // chunk size for context embedding
+
+    std::string chunk_separator = "\n"; // chunk separator for context embedding
+
+    // passkey params
+    int32_t n_junk = 250; // number of times to repeat the junk text
+    int32_t i_pos  = -1;  // position of the passkey in the junk text
+
+    // imatrix params
+    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
+    int32_t n_save_freq = 0;  // save the imatrix every n_save_freq iterations
+    int32_t i_chunk     = 0;  // start processing from this chunk
+
+    bool process_output = false; // collect data for the output tensor
+    bool compute_ppl    = true;  // whether to compute perplexity
+
+    // cvector-generator params
+    int n_pca_batch = 100;
+    int n_pca_iterations = 1000;
+    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
+    std::string cvector_outfile       = "control_vector.gguf";
+    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+    bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
 };
 
+void gpt_params_handle_hf_token(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);
 
 bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
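
Two defaults changed quietly in the hunks above: n_ctx now starts at 0 (which llama.cpp interprets as "use the context length stored in the model") and escape now defaults to true. A minimal sketch, not code from this package, of populating a few of the new or re-defaulted fields; the model path is hypothetical:

```cpp
#include "common.h"

// Minimal sketch of the reworked gpt_params; the model path is hypothetical.
gpt_params make_params() {
    gpt_params params;
    params.model          = "models/7B/ggml-model-q4_0.gguf"; // hypothetical path
    params.n_ctx          = 0;  // new default: 0 = take the context size from the model
    params.hf_token       = ""; // new field: token for authenticated Hugging Face downloads
    params.embd_normalize = 2;  // new embedding block: 2 = euclidean norm
    return params;
}
```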
@@ -199,7 +276,20 @@ std::vector<std::string> string_split(std::string input, char separator);
 
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
-std::string string_random_prompt(std::mt19937 & rng);
+
+template<class T>
+static std::vector<T> string_split(const std::string & str, char delim) {
+    std::vector<T> values;
+    std::istringstream str_stream(str);
+    std::string token;
+    while (std::getline(str_stream, token, delim)) {
+        T value;
+        std::istringstream token_stream(token);
+        token_stream >> value;
+        values.push_back(value);
+    }
+    return values;
+}
 
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
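
The new string_split template above is self-contained, so it can be exercised standalone; a sketch with the template body copied from the hunk:

```cpp
// Standalone sketch: the string_split<T> template from the hunk above,
// exercised on the kind of comma-separated list the new batched-bench
// params (n_pp, n_tg, n_pl) are parsed from.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

int main() {
    for (int v : string_split<int>("128,256,512", ',')) {
        std::cout << v << '\n'; // prints 128, 256, 512
    }
}
```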
@@ -212,6 +302,7 @@ bool fs_validate_filename(const std::string & filename);
 bool fs_create_directory_with_parents(const std::string & path);
 
 std::string fs_get_cache_directory();
+std::string fs_get_cache_file(const std::string & filename);
 
 //
 // Model utils
@@ -223,8 +314,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
 
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
 // Batch utils
 
@@ -262,26 +353,50 @@ std::string llama_token_to_piece(
         llama_token token,
         bool special = true);
 
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-// that takes into account the tokenizer type and decides how to handle the leading space
-//
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
-        llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
+// optionally renders special/control tokens
+std::string llama_detokenize(
         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+        const std::vector<llama_token> & tokens,
+        bool special = true);
 
 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
 bool llama_should_add_bos_token(const llama_model * model);
 
+//
+// Chat template utils
+//
+
+// same with llama_chat_message, but uses std::string
+struct llama_chat_msg {
+    std::string role;
+    std::string content;
+};
+
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool llama_chat_verify_template(const std::string & tmpl);
+
+// CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & chat,
+        bool add_ass);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass);
+
+// Returns an example of formatted chat
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl);
+
 //
 // KV cache utils
 //
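
A hedged sketch of the chat-template wrappers declared above. Passing nullptr for the model together with a named template is an assumption about the wrapper (the comments above say a custom tmpl is honored, with chatml as the fallback):

```cpp
#include <cstdio>
#include <string>
#include <vector>
#include "common.h"

// Hedged sketch; "chatml" as a template name and the nullptr model are
// assumptions about how the wrapper treats an explicit template string.
int main() {
    std::vector<llama_chat_msg> chat = {
        {"system", "You are a helpful assistant."},
        {"user",   "Hello!"},
    };
    if (llama_chat_verify_template("chatml")) {
        // add_ass = true appends the assistant turn header to prompt the reply
        std::string prompt = llama_chat_apply_template(nullptr, "chatml", chat, true);
        printf("%s\n", prompt.c_str());
    }
}
```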
@@ -296,7 +411,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //
 
-void llama_embd_normalize(const float * inp, float * out, int n);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
 
 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
@@ -340,4 +455,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 void yaml_dump_non_result_info(
     FILE * stream, const gpt_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
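
One more common.h change worth flagging from the embedding-utils hunk above: llama_embd_normalize gained an embd_norm selector that mirrors the new gpt_params::embd_normalize field (-1 = none, 0 = max absolute int16, 1 = taxicab, 2 = euclidean, >2 = p-norm). A call-site sketch:

```cpp
#include "common.h"

// Sketch: the same buffer normalized with the default euclidean norm and
// with the taxicab (L1) norm via the new trailing argument.
void normalize_example(const float * embd, float * out, int n_embd) {
    llama_embd_normalize(embd, out, n_embd);    // embd_norm defaults to 2 (euclidean)
    llama_embd_normalize(embd, out, n_embd, 1); // 1 = taxicab norm
}
```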
package/src/llama.cpp/common/grammar-parser.cpp:

@@ -46,8 +46,12 @@ namespace grammar_parser {
         state.rules[rule_id] = rule;
     }
 
+    static bool is_digit_char(char c) {
+        return '0' <= c && c <= '9';
+    }
+
     static bool is_word_char(char c) {
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
+        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
     }
 
     static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
@@ -99,6 +103,17 @@ namespace grammar_parser {
         return pos;
     }
 
+    static const char * parse_int(const char * src) {
+        const char * pos = src;
+        while (is_digit_char(*pos)) {
+            pos++;
+        }
+        if (pos == src) {
+            throw std::runtime_error(std::string("expecting integer at ") + src);
+        }
+        return pos;
+    }
+
     static std::pair<uint32_t, const char *> parse_char(const char * src) {
         if (*src == '\\') {
             switch (src[1]) {
@@ -137,6 +152,60 @@ namespace grammar_parser {
             bool is_nested) {
         size_t last_sym_start = out_elements.size();
         const char * pos = src;
+
+        auto handle_repetitions = [&](int min_times, int max_times) {
+
+            if (last_sym_start == out_elements.size()) {
+                throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
+            }
+
+            // apply transformation to previous symbol (last_sym_start to end) according to
+            // the following rewrite rules:
+            // S{m,n} --> S S S (m times) S'(n-m)
+            //            S'(x)   ::= S S'(x-1) |
+            //            (... n-m definitions of these S' rules ...)
+            //            S'(1)   ::= S |
+            // S{m,}  --> S S S (m times) S'
+            //            S'     ::= S S' |
+            // S*     --> S{0,}
+            //        --> S'     ::= S S' |
+            // S+     --> S{1,}
+            //        --> S S'
+            //            S'     ::= S S' |
+            // S?     --> S{0,1}
+            //        --> S'
+            //            S'     ::= S |
+
+            std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
+            if (min_times == 0) {
+                out_elements.resize(last_sym_start);
+            } else {
+                // Repeat the previous elements (min_times - 1) times
+                for (int i = 1; i < min_times; i++) {
+                    out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
+                }
+            }
+
+            uint32_t last_rec_rule_id = 0;
+            auto n_opt = max_times < 0 ? 1 : max_times - min_times;
+
+            std::vector<llama_grammar_element> rec_rule(previous_elements);
+            for (int i = 0; i < n_opt; i++) {
+                rec_rule.resize(previous_elements.size());
+                uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
+                if (i > 0 || max_times < 0) {
+                    rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
+                }
+                rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+                rec_rule.push_back({LLAMA_GRETYPE_END, 0});
+                add_rule(state, rec_rule_id, rec_rule);
+                last_rec_rule_id = rec_rule_id;
+            }
+            if (n_opt > 0) {
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
+            }
+        };
+
         while (*pos) {
             if (*pos == '"') { // literal string
                 pos++;
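
To make the rewrite rules above concrete: for a preceding symbol S with the bound {2,4}, min_times = 2 copies of S stay inline and n_opt = 4 - 2 = 2 helper rules are generated, giving S S S'(2) with S'(2) ::= S S'(1) | and S'(1) ::= S | (the trailing | is an empty alternative). An unbounded {2,} instead emits a single self-recursive helper: S S S' with S' ::= S S' |.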
@@ -197,40 +266,51 @@ namespace grammar_parser {
                     throw std::runtime_error(std::string("expecting ')' at ") + pos);
                 }
                 pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
-                if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
-                }
+            } else if (*pos == '.') { // any char
+                last_sym_start = out_elements.size();
+                out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '*') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, -1);
+            } else if (*pos == '+') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(1, -1);
+            } else if (*pos == '?') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, 1);
+            } else if (*pos == '{') {
+                pos = parse_space(pos + 1, is_nested);
 
-                // apply transformation to previous symbol (last_sym_start to end) according to
-                // rewrite rules:
-                // S* --> S' ::= S S' |
-                // S+ --> S' ::= S S' | S
-                // S? --> S' ::= S |
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                std::vector<llama_grammar_element> sub_rule;
-                // add preceding symbol to generated rule
-                sub_rule.insert(
-                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                if (*pos == '*' || *pos == '+') {
-                    // cause generated rule to recurse
-                    sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+                if (!is_digit_char(*pos)) {
+                    throw std::runtime_error(std::string("expecting an int at ") + pos);
                 }
-                // mark start of alternate def
-                sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
-                if (*pos == '+') {
-                    // add preceding symbol as alternate only for '+' (otherwise empty)
-                    sub_rule.insert(
-                        sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                }
-                sub_rule.push_back({LLAMA_GRETYPE_END, 0});
-                add_rule(state, sub_rule_id, sub_rule);
+                const char * int_end = parse_int(pos);
+                int min_times = std::stoul(std::string(pos, int_end - pos));
+                pos = parse_space(int_end, is_nested);
 
-                // in original rule, replace previous symbol with reference to generated rule
-                out_elements.resize(last_sym_start);
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+                int max_times = -1;
 
-                pos = parse_space(pos + 1, is_nested);
+                if (*pos == '}') {
+                    max_times = min_times;
+                    pos = parse_space(pos + 1, is_nested);
+                } else if (*pos == ',') {
+                    pos = parse_space(pos + 1, is_nested);
+
+                    if (is_digit_char(*pos)) {
+                        const char * int_end = parse_int(pos);
+                        max_times = std::stoul(std::string(pos, int_end - pos));
+                        pos = parse_space(int_end, is_nested);
+                    }
+
+                    if (*pos != '}') {
+                        throw std::runtime_error(std::string("expecting '}' at ") + pos);
+                    }
+                    pos = parse_space(pos + 1, is_nested);
+                } else {
+                    throw std::runtime_error(std::string("expecting ',' at ") + pos);
+                }
+                handle_repetitions(min_times, max_times);
             } else {
                 break;
             }
@@ -325,6 +405,7 @@ namespace grammar_parser {
             case LLAMA_GRETYPE_CHAR_NOT:       return true;
             case LLAMA_GRETYPE_CHAR_ALT:       return true;
             case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+            case LLAMA_GRETYPE_CHAR_ANY:       return true;
             default:                           return false;
         }
     }
@@ -339,6 +420,7 @@ namespace grammar_parser {
             case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
             case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
             case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
+            case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY");       break;
         }
         switch (elem.type) {
             case LLAMA_GRETYPE_END:
@@ -350,6 +432,7 @@ namespace grammar_parser {
             case LLAMA_GRETYPE_CHAR_NOT:
             case LLAMA_GRETYPE_CHAR_RNG_UPPER:
             case LLAMA_GRETYPE_CHAR_ALT:
+            case LLAMA_GRETYPE_CHAR_ANY:
                 fprintf(file, "(\"");
                 print_grammar_char(file, elem.value);
                 fprintf(file, "\") ");
@@ -407,11 +490,15 @@ namespace grammar_parser {
                     }
                     print_grammar_char(file, elem.value);
                     break;
+                case LLAMA_GRETYPE_CHAR_ANY:
+                    fprintf(file, ".");
+                    break;
             }
             if (is_char_element(elem)) {
                 switch (rule[i + 1].type) {
                     case LLAMA_GRETYPE_CHAR_ALT:
                     case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                    case LLAMA_GRETYPE_CHAR_ANY:
                         break;
                     default:
                        fprintf(file, "] ");
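
A sketch exercising the new operators end to end. It assumes the grammar_parser::parse and grammar_parser::print_grammar declarations from grammar-parser.h, which accompanies this file in the vendored llama.cpp:

```cpp
#include <cstdio>
#include "grammar-parser.h"

// Sketch: '.' (any char, LLAMA_GRETYPE_CHAR_ANY) and '{m,n}' bounded
// repetition are both new in this version; print_grammar shows the
// recursive helper rules that handle_repetitions generates.
int main() {
    const char * src = "root ::= \"#\" [0-9]{2,4} \" \" .+";
    grammar_parser::parse_state state = grammar_parser::parse(src);
    grammar_parser::print_grammar(stdout, state);
    return 0;
}
```

With the 0.2.3 parser this grammar would be rejected at the '{'; here the bound expands into the helper rules described above, and the any-char element prints back as '.'.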