@fugood/llama.node 0.2.3 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. package/CMakeLists.txt +6 -3
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +3 -3
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  24. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  25. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  26. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  27. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  28. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  29. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  31. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  32. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  36. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  37. package/src/llama.cpp/CMakeLists.txt +91 -1245
  38. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  39. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  40. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  41. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  42. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  43. package/src/llama.cpp/common/common.cpp +1116 -877
  44. package/src/llama.cpp/common/common.h +191 -77
  45. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  46. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  47. package/src/llama.cpp/common/log.h +1 -1
  48. package/src/llama.cpp/common/ngram-cache.h +10 -3
  49. package/src/llama.cpp/common/sampling.cpp +19 -10
  50. package/src/llama.cpp/docs/build.md +353 -0
  51. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  52. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  54. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  56. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  58. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  60. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  61. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  62. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  63. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  64. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  65. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  66. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  67. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  68. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  69. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  71. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  72. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  73. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  75. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  76. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  77. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  79. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  80. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  87. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  88. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  90. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  91. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  92. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  94. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  95. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  96. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  97. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  98. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  99. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  100. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  102. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  103. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  104. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  105. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  106. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  107. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  108. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  110. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  111. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  112. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  113. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  114. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  115. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  116. package/src/llama.cpp/examples/main/main.cpp +98 -75
  117. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  118. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  119. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  120. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  121. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  122. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  123. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  124. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  125. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  126. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  127. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  128. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  129. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  130. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  131. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  132. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  133. package/src/llama.cpp/examples/server/server.cpp +274 -671
  134. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  135. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  136. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  137. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  138. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  139. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  140. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  141. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  142. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  143. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  144. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  145. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  146. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  147. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  148. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  149. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  150. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  151. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  152. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  153. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  154. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  155. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  156. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  157. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  159. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  160. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  161. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  162. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  163. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  178. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  179. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  180. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  181. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  182. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  183. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  184. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  208. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  209. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  210. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  211. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  212. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  214. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  215. package/src/llama.cpp/models/.editorconfig +1 -0
  216. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  217. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  221. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  230. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  233. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  237. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  249. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  252. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  255. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  258. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  259. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  260. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  261. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  263. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  264. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  265. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  266. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  267. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  268. package/src/llama.cpp/requirements.txt +5 -4
  269. package/src/llama.cpp/scripts/build-info.sh +30 -0
  270. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  271. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  272. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  273. package/src/llama.cpp/src/llama-grammar.h +39 -0
  274. package/src/llama.cpp/src/llama-impl.h +26 -0
  275. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  276. package/src/llama.cpp/src/llama-sampling.h +56 -0
  277. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  278. package/src/llama.cpp/src/llama-vocab.h +130 -0
  279. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  280. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  281. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  282. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  283. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  284. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  285. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  286. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  287. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  289. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  290. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  291. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  292. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  293. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  294. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  295. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  296. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  297. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  298. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  299. package/bin/darwin/arm64/default.metallib +0 -0
  300. package/bin/darwin/x64/default.metallib +0 -0
  301. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  302. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  303. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  304. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  305. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  306. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  307. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  308. package/src/llama.cpp/ggml-opencl.h +0 -36
  309. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  310. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  311. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  314. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  315. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  316. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  317. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  318. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  319. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
@@ -33,17 +33,15 @@

  #define LLAMA_DEFAULT_SEED 0xFFFFFFFF

- #define LLAMA_MAX_RNG_STATE (64*1024)
-
  #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
  #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
  #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'

  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
- #define LLAMA_SESSION_VERSION 6
+ #define LLAMA_SESSION_VERSION 8

  #define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
- #define LLAMA_STATE_SEQ_VERSION 1
+ #define LLAMA_STATE_SEQ_VERSION 2

  #ifdef __cplusplus
  extern "C" {
@@ -67,6 +65,7 @@ extern "C" {
  LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
  LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
  LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
+ LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
  };

  // pre-tokenization types
@@ -86,6 +85,14 @@ extern "C" {
  LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
  LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
  LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+ LLAMA_VOCAB_PRE_TYPE_PORO = 15,
+ LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
+ LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
+ LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
+ LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
+ LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
+ LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
  };

  // note: these values should be synchronized with ggml_rope
@@ -97,7 +104,7 @@ extern "C" {
  LLAMA_ROPE_TYPE_GLM = 4,
  };

- enum llama_token_type {
+ enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
  LLAMA_TOKEN_TYPE_UNDEFINED = 0,
  LLAMA_TOKEN_TYPE_NORMAL = 1,
  LLAMA_TOKEN_TYPE_UNKNOWN = 2,
@@ -107,13 +114,27 @@ extern "C" {
  LLAMA_TOKEN_TYPE_BYTE = 6,
  };

+ enum llama_token_attr {
+ LLAMA_TOKEN_ATTR_UNDEFINED = 0,
+ LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0,
+ LLAMA_TOKEN_ATTR_UNUSED = 1 << 1,
+ LLAMA_TOKEN_ATTR_NORMAL = 1 << 2,
+ LLAMA_TOKEN_ATTR_CONTROL = 1 << 3, // SPECIAL?
+ LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
+ LLAMA_TOKEN_ATTR_BYTE = 1 << 5,
+ LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6,
+ LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7,
+ LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8,
+ LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9,
+ };
+
  // model file types
  enum llama_ftype {
  LLAMA_FTYPE_ALL_F32 = 0,
  LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+ // LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
  // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
  // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
  LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
@@ -142,6 +163,9 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors

  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
  };
@@ -159,6 +183,13 @@ extern "C" {
  LLAMA_POOLING_TYPE_NONE = 0,
  LLAMA_POOLING_TYPE_MEAN = 1,
  LLAMA_POOLING_TYPE_CLS = 2,
+ LLAMA_POOLING_TYPE_LAST = 3,
+ };
+
+ enum llama_attention_type {
+ LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
+ LLAMA_ATTENTION_TYPE_CAUSAL = 0,
+ LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
  };

  enum llama_split_mode {
@@ -278,7 +309,7 @@ extern "C" {

  enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
  enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
- // (ignored if no pooling layer)
+ enum llama_attention_type attention_type; // attention type to use for embeddings

  // ref: https://github.com/ggerganov/llama.cpp/pull/2054
  float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -351,6 +382,9 @@ extern "C" {
  // modifies a preceding LLAMA_GRETYPE_CHAR or
  // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
  LLAMA_GRETYPE_CHAR_ALT = 6,
+
+ // any character (.)
+ LLAMA_GRETYPE_CHAR_ANY = 7,
  };

  typedef struct llama_grammar_element {
@@ -378,6 +412,9 @@ extern "C" {
  const char * content;
  } llama_chat_message;

+ // lora adapter
+ struct llama_lora_adapter;
+
  // Helpers for getting default parameters
  LLAMA_API struct llama_model_params llama_model_default_params(void);
  LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -424,8 +461,8 @@ extern "C" {

  LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);

- LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
- LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
+ LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
+ LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);

  LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
  LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@@ -464,24 +501,45 @@ extern "C" {
  // Get a llama model tensor
  LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);

+ // Returns true if the model contains an encoder that requires llama_encode() call
+ LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
+
+ // For encoder-decoder models, this function returns id of the token that must be provided
+ // to the decoder to start generating output sequence. For other models, it returns -1.
+ LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
+
  // Returns 0 on success
  LLAMA_API uint32_t llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
  const llama_model_quantize_params * params);

- // Apply a LoRA adapter to a loaded model
- // path_base_model is the path to a higher quality model to use as a base for
- // the layers modified by the adapter. Can be NULL to use the current loaded model.
- // The model needs to be reloaded before applying a new adapter, otherwise the adapter
- // will be applied on top of the previous one
- // Returns 0 on success
- LLAMA_API int32_t llama_model_apply_lora_from_file(
- const struct llama_model * model,
- const char * path_lora,
- float scale,
- const char * path_base_model,
- int32_t n_threads);
+ // Load a LoRA adapter from file
+ // The loaded adapter will be associated to the given model, and will be free when the model is deleted
+ LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+ struct llama_model * model,
+ const char * path_lora);
+
+ // Add a loaded LoRA adapter to given context
+ // This will not modify model's weight
+ LLAMA_API int32_t llama_lora_adapter_set(
+ struct llama_context * ctx,
+ struct llama_lora_adapter * adapter,
+ float scale);
+
+ // Remove a specific LoRA adapter from given context
+ // Return -1 if the adapter is not present in the context
+ LLAMA_API int32_t llama_lora_adapter_remove(
+ struct llama_context * ctx,
+ struct llama_lora_adapter * adapter);
+
+ // Remove all LoRA adapters from given context
+ LLAMA_API void llama_lora_adapter_clear(
+ struct llama_context * ctx);
+
+ // Manually free a LoRA adapter
+ // Note: loaded adapters will be free when the associated model is deleted
+ LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);

  // Apply a loaded control vector to a llama_context, or if data is NULL, clear
  // the currently loaded vector.
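The hunk above replaces llama_model_apply_lora_from_file() with a handle-based adapter API. A minimal migration sketch, assuming a model and context are already loaded; the adapter path below is a placeholder, not part of this diff:

// Sketch only: using the 0.3.x LoRA adapter handles declared above.
// "model", "ctx" and the adapter path are assumed inputs, not taken from this package.
#include "llama.h"

static bool attach_lora(struct llama_model * model, struct llama_context * ctx) {
    // The adapter is associated with the model and freed automatically when the model is deleted.
    struct llama_lora_adapter * adapter =
        llama_lora_adapter_init(model, "/path/to/adapter.gguf"); // hypothetical path
    if (adapter == NULL) {
        return false;
    }
    // Attach to this context without modifying the base weights; 1.0f applies it at full strength.
    if (llama_lora_adapter_set(ctx, adapter, 1.0f) != 0) {
        return false;
    }
    // llama_lora_adapter_remove(ctx, adapter) or llama_lora_adapter_clear(ctx) detaches it later;
    // llama_lora_adapter_free(adapter) is only needed when freeing before the model itself.
    return true;
}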
@@ -631,10 +689,11 @@ extern "C" {
  // State / sessions
  //

- // Returns the maximum size in bytes of the state (rng, logits, embedding
- // and kv_cache) - will often be smaller after compacting tokens
- LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
- LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+ // Returns the *actual* size in bytes of the state
+ // (rng, logits, embedding and kv_cache)
+ // Only use when saving the state, not when restoring it, otherwise the size may be too small.
+ LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
+ LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
  "use llama_state_get_size instead");

  // Copies the state to the specified destination address.
@@ -642,7 +701,8 @@ extern "C" {
  // Returns the number of bytes copied
  LLAMA_API size_t llama_state_get_data(
  struct llama_context * ctx,
- uint8_t * dst);
+ uint8_t * dst,
+ size_t size);
  LLAMA_API DEPRECATED(size_t llama_copy_state_data(
  struct llama_context * ctx,
  uint8_t * dst),
@@ -652,7 +712,8 @@ extern "C" {
  // Returns the number of bytes read
  LLAMA_API size_t llama_state_set_data(
  struct llama_context * ctx,
- const uint8_t * src);
+ const uint8_t * src,
+ size_t size);
  LLAMA_API DEPRECATED(size_t llama_set_state_data(
  struct llama_context * ctx,
  const uint8_t * src),
@@ -694,6 +755,7 @@ extern "C" {
  LLAMA_API size_t llama_state_seq_get_data(
  struct llama_context * ctx,
  uint8_t * dst,
+ size_t size,
  llama_seq_id seq_id);

  // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
@@ -703,6 +765,7 @@ extern "C" {
  LLAMA_API size_t llama_state_seq_set_data(
  struct llama_context * ctx,
  const uint8_t * src,
+ size_t size,
  llama_seq_id dest_seq_id);

  LLAMA_API size_t llama_state_seq_save_file(
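The state calls above (llama_state_get_data/llama_state_set_data and the _seq_ variants) now take an explicit buffer size. A hedged round-trip sketch using only the declarations shown; the context is assumed to already exist:

// Sketch only: save and restore the full context state with the size-checked 0.3.x calls.
#include <vector>
#include "llama.h"

static std::vector<uint8_t> save_state(struct llama_context * ctx) {
    // Per the comment above, llama_state_get_size() is only meant for sizing the save buffer.
    std::vector<uint8_t> buf(llama_state_get_size(ctx));
    const size_t written = llama_state_get_data(ctx, buf.data(), buf.size());
    buf.resize(written); // actual bytes copied may be smaller than the upper bound
    return buf;
}

static bool restore_state(struct llama_context * ctx, const std::vector<uint8_t> & buf) {
    // Returns the number of bytes read from the buffer.
    return llama_state_set_data(ctx, buf.data(), buf.size()) > 0;
}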
@@ -749,6 +812,14 @@ extern "C" {
  // Frees a batch of tokens allocated with llama_batch_init()
  LLAMA_API void llama_batch_free(struct llama_batch batch);

+ // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
+ // Stores the encoder output internally for later use by the decoder cross-attention layers.
+ // 0 - success
+ // < 0 - error
+ LLAMA_API int32_t llama_encode(
+ struct llama_context * ctx,
+ struct llama_batch batch);
+
  // Positive return values does not mean a fatal error, but rather a warning.
  // 0 - success
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
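llama_encode() only applies to models that carry an encoder (e.g. T5-style encoder-decoder models). A hedged outline of the call order implied by the comments above; building the batch itself is out of scope here:

// Sketch only: run the encoder pass before decoding, for encoder-decoder models.
// "model", "ctx" and "batch" are assumed to be prepared by the caller.
#include "llama.h"

static int32_t encode_prompt(struct llama_model * model,
                             struct llama_context * ctx,
                             struct llama_batch batch) {
    if (!llama_model_has_encoder(model)) {
        return 0; // decoder-only model: llama_decode() alone is enough
    }
    // Encoder output is kept inside the context for the decoder's cross-attention.
    const int32_t rc = llama_encode(ctx, batch);
    if (rc < 0) {
        return rc; // < 0 signals an error, per the header comment
    }
    // Generation then starts from llama_model_decoder_start_token(model) (-1 for other models).
    return 0;
}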
@@ -768,6 +839,10 @@ extern "C" {
  // Get the number of threads used for prompt and batch processing (multiple token).
  LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);

+ // Set whether the model is in embeddings mode or not
+ // If true, embeddings will be returned but logits will not
+ LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
  // Set whether to use causal attention or not
  // If set to true, the model will only attend to the past tokens
  LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
@@ -821,7 +896,7 @@ extern "C" {

  LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);

- LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
+ LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);

  // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
  LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
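llama_token_get_type() gives way to llama_token_get_attr(), which returns the llama_token_attr bitmask introduced earlier in this diff, so callers test individual flags with a mask instead of comparing a single enum value. A hedged sketch:

// Sketch only: testing a token attribute bit with the new API.
#include "llama.h"

static bool is_control_token(const struct llama_model * model, llama_token token) {
    const enum llama_token_attr attr = llama_token_get_attr(model, token);
    return (attr & LLAMA_TOKEN_ATTR_CONTROL) != 0;
}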
@@ -835,12 +910,13 @@ extern "C" {
  LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
  LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
  LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+ LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding

  // Returns -1 if unknown, 1 for true or 0 for false.
- LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
+ LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);

  // Returns -1 if unknown, 1 for true or 0 for false.
- LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
+ LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);

  // Codellama infill tokens
  LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
@@ -856,6 +932,7 @@ extern "C" {
  /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
  /// @return Returns the number of tokens on success, no more than n_tokens_max
  /// @return Returns a negative number on failure - the number of tokens that would have been returned
+ /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
  /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
  /// as plaintext. Does not insert a leading space.
  LLAMA_API int32_t llama_tokenize(
@@ -870,15 +947,35 @@ extern "C" {
  // Token Id -> Piece.
  // Uses the vocabulary in the provided context.
  // Does not write null terminator to the buffer.
- // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+ // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
  // @param special If true, special tokens are rendered in the output.
  LLAMA_API int32_t llama_token_to_piece(
  const struct llama_model * model,
  llama_token token,
  char * buf,
  int32_t length,
+ int32_t lstrip,
  bool special);

+ /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
+ /// @param text The char pointer must be large enough to hold the resulting text.
+ /// @return Returns the number of chars/bytes on success, no more than text_len_max.
+ /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
+ /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
+ /// @param unparse_special If true, special tokens are rendered in the output.
+ LLAMA_API int32_t llama_detokenize(
+ const struct llama_model * model,
+ const llama_token * tokens,
+ int32_t n_tokens,
+ char * text,
+ int32_t text_len_max,
+ bool remove_special,
+ bool unparse_special);
+
+ //
+ // Chat templates
+ //
+
  /// Apply chat template. Inspired by hf apply_chat_template() on python.
  /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
  /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
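llama_detokenize() is the new inverse of llama_tokenize(); per the @return comments above, a negative result is minus the number of bytes that would have been written. A hedged grow-and-retry sketch:

// Sketch only: detokenize a token sequence, retrying once if the first buffer is too small.
#include <string>
#include <vector>
#include "llama.h"

static std::string detokenize(const struct llama_model * model,
                              const std::vector<llama_token> & tokens) {
    std::string text(tokens.size() * 4, '\0'); // rough initial guess, not a documented bound
    int32_t n = llama_detokenize(model, tokens.data(), (int32_t) tokens.size(),
                                 &text[0], (int32_t) text.size(),
                                 /*remove_special=*/false, /*unparse_special=*/false);
    if (n < 0) {
        text.resize((size_t) -n); // negative return encodes the required buffer size
        n = llama_detokenize(model, tokens.data(), (int32_t) tokens.size(),
                             &text[0], (int32_t) text.size(),
                             /*remove_special=*/false, /*unparse_special=*/false);
    }
    text.resize(n > 0 ? (size_t) n : 0);
    return text;
}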
@@ -902,6 +999,12 @@ extern "C" {
  // Grammar
  //

+ /// Initialize a llama_grammar.
+ ///
+ /// @param rules The rule elements of the grammar to initialize.
+ /// @param n_rules The number of rules.
+ /// @param start_rule_index The index of the root rule (the starting point of the grammar).
+ /// @return The initialized llama_grammar or nullptr if initialization failed.
  LLAMA_API struct llama_grammar * llama_grammar_init(
  const llama_grammar_element ** rules,
  size_t n_rules,
@@ -911,6 +1014,23 @@ extern "C" {

  LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);

+ /// @details Apply constraints from grammar
+ LLAMA_API void llama_grammar_sample(
+ const struct llama_grammar * grammar,
+ const struct llama_context * ctx,
+ llama_token_data_array * candidates);
+ LLAMA_API DEPRECATED(void llama_sample_grammar(
+ struct llama_context * ctx,
+ llama_token_data_array * candidates,
+ const struct llama_grammar * grammar),
+ "use llama_grammar_sample instead");
+
+ /// @details Accepts the sampled token into the grammar
+ LLAMA_API void llama_grammar_accept_token(
+ struct llama_grammar * grammar,
+ struct llama_context * ctx,
+ llama_token token);
+
  //
  // Sampling functions
  //
@@ -992,12 +1112,6 @@ extern "C" {
  llama_token_data_array * candidates,
  float temp);

- /// @details Apply constraints from grammar
- LLAMA_API void llama_sample_grammar(
- struct llama_context * ctx,
- llama_token_data_array * candidates,
- const struct llama_grammar * grammar);
-
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1035,56 +1149,10 @@ extern "C" {
  struct llama_context * ctx,
  llama_token_data_array * candidates);

- /// @details Accepts the sampled token into the grammar
- LLAMA_API void llama_grammar_accept_token(
- struct llama_context * ctx,
- struct llama_grammar * grammar,
- llama_token token);
-
  //
- // Beam search
+ // Model split
  //

- struct llama_beam_view {
- const llama_token * tokens;
-
- size_t n_tokens;
- float p; // Cumulative beam probability (renormalized relative to all beams)
- bool eob; // Callback should set this to true when a beam is at end-of-beam.
- };
-
- // Passed to beam_search_callback function.
- // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
- // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
- // These pointers are valid only during the synchronous callback, so should not be saved.
- struct llama_beams_state {
- struct llama_beam_view * beam_views;
-
- size_t n_beams; // Number of elements in beam_views[].
- size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
- bool last_call; // True iff this is the last callback invocation.
- };
-
- // Type of pointer to the beam_search_callback function.
- // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
- // passed back to beam_search_callback. This avoids having to use global variables in the callback.
- typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
-
- /// @details Deterministically returns entire sentence constructed by a beam search.
- /// @param ctx Pointer to the llama_context.
- /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
- /// @param callback_data A pointer that is simply passed back to callback.
- /// @param n_beams Number of beams to use.
- /// @param n_past Number of tokens already evaluated.
- /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
- LLAMA_API void llama_beam_search(
- struct llama_context * ctx,
- llama_beam_search_callback_fn_t callback,
- void * callback_data,
- size_t n_beams,
- int32_t n_past,
- int32_t n_predict);
-
  /// @details Build a split GGUF final path for this chunk.
  /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
  // Returns the split_path length.
@@ -1123,38 +1191,45 @@ extern "C" {

  struct ggml_tensor;

+ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+ struct llama_context * ctx
+ );
+
  struct llama_partial_utf8 {
  uint32_t value; // bit value so far (unshifted)
  int n_remain; // num bytes remaining; -1 indicates invalid sequence
  };

- struct llama_grammar {
- const std::vector<std::vector<llama_grammar_element>> rules;
- std::vector<std::vector<const llama_grammar_element *>> stacks;
-
- // buffer for partially generated UTF-8 sequence from accepted tokens
- llama_partial_utf8 partial_utf8;
- };
-
  struct llama_grammar_candidate {
  size_t index;
  const uint32_t * code_points;
  llama_partial_utf8 partial_utf8;
  };

- const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
- struct llama_context * ctx
- );
+ using llama_grammar_rule = std::vector< llama_grammar_element>;
+ using llama_grammar_stack = std::vector<const llama_grammar_element *>;
+
+ using llama_grammar_rules = std::vector<llama_grammar_rule>;
+ using llama_grammar_stacks = std::vector<llama_grammar_stack>;
+ using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
+
+ const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar);
+ llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar);

  void llama_grammar_accept(
- const std::vector<std::vector<llama_grammar_element>> & rules,
- const std::vector<std::vector<const llama_grammar_element *>> & stacks,
- const uint32_t chr,
- std::vector<std::vector<const llama_grammar_element *>> & new_stacks);
+ const llama_grammar_rules & rules,
+ const llama_grammar_stacks & stacks,
+ const uint32_t chr,
+ llama_grammar_stacks & new_stacks);
+
+ std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
+ const llama_grammar_rules & rules,
+ const llama_grammar_stack & stack,
+ const llama_grammar_candidates & candidates);

  std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
  const std::string & src,
- llama_partial_utf8 partial_start);
+ llama_partial_utf8 partial_start);

  // Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
  // This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
@@ -0,0 +1 @@
+ root = true
@@ -0,0 +1,112 @@
+ ied 4 ½ months
+ __ggml_vocab_test__
+ Führer
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+
+ __ggml_vocab_test__
+
+
+
+ __ggml_vocab_test__
+
+
+
+
+ __ggml_vocab_test__
+
+
+ __ggml_vocab_test__
+ Hello world
+ __ggml_vocab_test__
+ Hello world
+ __ggml_vocab_test__
+ Hello World
+ __ggml_vocab_test__
+ Hello World
+ __ggml_vocab_test__
+ Hello World!
+ __ggml_vocab_test__
+ Hello, world!
+ __ggml_vocab_test__
+ Hello, world!
+ __ggml_vocab_test__
+ this is 🦙.cpp
+ __ggml_vocab_test__
+ w048 7tuijk dsdfhu
+ __ggml_vocab_test__
+ нещо на Български
+ __ggml_vocab_test__
+ កាន់តែពិសេសអាចខលចេញ
+ __ggml_vocab_test__
+ 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ Hello
+ __ggml_vocab_test__
+ (
+ __ggml_vocab_test__
+
+ =
+ __ggml_vocab_test__
+ ' era
+ __ggml_vocab_test__
+ Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+ __ggml_vocab_test__
+ !!!!!!
+ __ggml_vocab_test__
+ 3
+ __ggml_vocab_test__
+ 33
+ __ggml_vocab_test__
+ 333
+ __ggml_vocab_test__
+ 3333
+ __ggml_vocab_test__
+ 33333
+ __ggml_vocab_test__
+ 333333
+ __ggml_vocab_test__
+ 3333333
+ __ggml_vocab_test__
+ 33333333
+ __ggml_vocab_test__
+ 333333333
+ __ggml_vocab_test__
+ Cửa Việt
+ __ggml_vocab_test__
+ discards
+ __ggml_vocab_test__
+
+
+
+
+
+
+
+
+
+
+
+ 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+ __ggml_vocab_test__