@fugood/llama.node 0.2.3 → 0.3.1

This diff shows the changes between publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.
Files changed (319)
  1. package/CMakeLists.txt +6 -3
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +3 -3
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  24. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  25. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  26. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  27. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  28. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  29. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  31. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  32. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  33. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  34. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  35. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  36. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  37. package/src/llama.cpp/CMakeLists.txt +91 -1245
  38. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  39. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  40. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  41. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  42. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  43. package/src/llama.cpp/common/common.cpp +1116 -877
  44. package/src/llama.cpp/common/common.h +191 -77
  45. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  46. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  47. package/src/llama.cpp/common/log.h +1 -1
  48. package/src/llama.cpp/common/ngram-cache.h +10 -3
  49. package/src/llama.cpp/common/sampling.cpp +19 -10
  50. package/src/llama.cpp/docs/build.md +353 -0
  51. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  52. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  54. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  56. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  58. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  60. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  61. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  62. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  63. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  64. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  65. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  66. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  67. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  68. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  69. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  71. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  72. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  73. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  75. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  76. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  77. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  79. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  80. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  87. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  88. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  90. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  91. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  92. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  94. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  95. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  96. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  97. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  98. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  99. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  100. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  102. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  103. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  104. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  105. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  106. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  107. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  108. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  110. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  111. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  112. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  113. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  114. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  115. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  116. package/src/llama.cpp/examples/main/main.cpp +98 -75
  117. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  118. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  119. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  120. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  121. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  122. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  123. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  124. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  125. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  126. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  127. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  128. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  129. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  130. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  131. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  132. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  133. package/src/llama.cpp/examples/server/server.cpp +274 -671
  134. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  135. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  136. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  137. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  138. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  139. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  140. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  141. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  142. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  143. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  144. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  145. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  146. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  147. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  148. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  149. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  150. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  151. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  152. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  153. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  154. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  155. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  156. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  157. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  159. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  160. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  161. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  162. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  163. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  178. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  179. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  180. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  181. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  182. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  183. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  184. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  208. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  209. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  210. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  211. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  212. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  214. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  215. package/src/llama.cpp/models/.editorconfig +1 -0
  216. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  217. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  221. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  224. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  230. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  233. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  237. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  243. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  246. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  249. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  252. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  255. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  258. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  259. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  260. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  261. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  263. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  264. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  265. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  266. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  267. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  268. package/src/llama.cpp/requirements.txt +5 -4
  269. package/src/llama.cpp/scripts/build-info.sh +30 -0
  270. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  271. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  272. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  273. package/src/llama.cpp/src/llama-grammar.h +39 -0
  274. package/src/llama.cpp/src/llama-impl.h +26 -0
  275. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  276. package/src/llama.cpp/src/llama-sampling.h +56 -0
  277. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  278. package/src/llama.cpp/src/llama-vocab.h +130 -0
  279. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  280. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  281. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  282. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  283. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  284. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  285. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  286. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  287. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  289. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  290. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  291. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  292. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  293. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  294. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  295. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  296. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  297. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  298. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  299. package/bin/darwin/arm64/default.metallib +0 -0
  300. package/bin/darwin/x64/default.metallib +0 -0
  301. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  302. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  303. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  304. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  305. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  306. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  307. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  308. package/src/llama.cpp/ggml-opencl.h +0 -36
  309. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  310. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  311. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  314. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  315. package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  316. package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  317. package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  318. package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  319. package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0

package/src/llama.cpp/requirements.txt
@@ -4,8 +4,9 @@
  # Package versions must stay compatible across all top-level python scripts.
  #

- -r ./requirements/requirements-convert.txt
+ -r ./requirements/requirements-convert_legacy_llama.txt

- -r ./requirements/requirements-convert-hf-to-gguf.txt
- -r ./requirements/requirements-convert-hf-to-gguf-update.txt
- -r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
+ -r ./requirements/requirements-convert_hf_to_gguf.txt
+ -r ./requirements/requirements-convert_hf_to_gguf_update.txt
+ -r ./requirements/requirements-convert_llama_ggml_to_gguf.txt
+ -r ./requirements/requirements-convert_lora_to_gguf.txt

package/src/llama.cpp/scripts/build-info.sh
@@ -0,0 +1,30 @@
+ #!/bin/sh
+
+ CC=$1
+
+ build_number="0"
+ build_commit="unknown"
+ build_compiler="unknown"
+ build_target="unknown"
+
+ if out=$(git rev-list --count HEAD); then
+   # git is broken on WSL so we need to strip extra newlines
+   build_number=$(printf '%s' "$out" | tr -d '\n')
+ fi
+
+ if out=$(git rev-parse --short HEAD); then
+   build_commit=$(printf '%s' "$out" | tr -d '\n')
+ fi
+
+ if out=$($CC --version | head -1); then
+   build_compiler=$out
+ fi
+
+ if out=$($CC -dumpmachine); then
+   build_target=$out
+ fi
+
+ echo "int LLAMA_BUILD_NUMBER = ${build_number};"
+ echo "char const *LLAMA_COMMIT = \"${build_commit}\";"
+ echo "char const *LLAMA_COMPILER = \"${build_compiler}\";"
+ echo "char const *LLAMA_BUILD_TARGET = \"${build_target}\";"

package/src/llama.cpp/scripts/install-oneapi.bat
@@ -0,0 +1,19 @@
+ :: MIT license
+ :: Copyright (C) 2024 Intel Corporation
+ :: SPDX-License-Identifier: MIT
+
+
+ set URL=%1
+ set COMPONENTS=%2
+
+ curl.exe --output %TEMP%\webimage.exe --url %URL% --retry 5 --retry-delay 5
+ start /b /wait %TEMP%\webimage.exe -s -x -f webimage_extracted --log extract.log
+ del %TEMP%\webimage.exe
+ if "%COMPONENTS%"=="" (
+   webimage_extracted\bootstrapper.exe -s --action install --eula=accept -p=NEED_VS2017_INTEGRATION=0 -p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=.
+ ) else (
+   webimage_extracted\bootstrapper.exe -s --action install --components=%COMPONENTS% --eula=accept -p=NEED_VS2017_INTEGRATION=0 -p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=.
+ )
+ set installer_exit_code=%ERRORLEVEL%
+ rd /s/q "webimage_extracted"
+ exit /b %installer_exit_code%

package/src/llama.cpp/src/CMakeLists.txt
@@ -0,0 +1,33 @@
+ # TODO: should not use this
+ if (WIN32)
+     if (BUILD_SHARED_LIBS)
+         set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+     endif()
+ endif()
+
+ #
+ # libraries
+ #
+
+ # llama
+
+ add_library(llama
+             ../include/llama.h
+             llama.cpp
+             llama-vocab.cpp
+             llama-grammar.cpp
+             llama-sampling.cpp
+             unicode.h
+             unicode.cpp
+             unicode-data.cpp
+             )
+
+ target_include_directories(llama PUBLIC . ../include)
+ target_compile_features   (llama PUBLIC cxx_std_11) # don't bump
+
+ target_link_libraries(llama PUBLIC ggml)
+
+ if (BUILD_SHARED_LIBS)
+     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
+     target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
+ endif()

package/src/llama.cpp/src/llama-grammar.cpp
@@ -0,0 +1,539 @@
+ #include "llama-grammar.h"
+
+ #include "llama-vocab.h"
+ #include "llama-sampling.h"
+
+ #include <algorithm>
+
+ // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
+ // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
+ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+         const std::string & src,
+         llama_partial_utf8 partial_start) {
+     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
+     const char * pos = src.c_str();
+     std::vector<uint32_t> code_points;
+
+     // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+     code_points.reserve(src.size() + 1);
+     uint32_t value = partial_start.value;
+     int n_remain = partial_start.n_remain;
+
+     // continue previous decode, if applicable
+     while (*pos != 0 && n_remain > 0) {
+         uint8_t next_byte = static_cast<uint8_t>(*pos);
+         if ((next_byte >> 6) != 2) {
+             // invalid sequence, abort
+             code_points.push_back(0);
+             return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
+         }
+         value = (value << 6) + (next_byte & 0x3F);
+         ++pos;
+         --n_remain;
+     }
+
+     if (partial_start.n_remain > 0 && n_remain == 0) {
+         code_points.push_back(value);
+     }
+
+     // decode any subsequent utf-8 sequences, which may end in an incomplete one
+     while (*pos != 0) {
+         uint8_t first_byte = static_cast<uint8_t>(*pos);
+         uint8_t highbits = first_byte >> 4;
+         n_remain = lookup[highbits] - 1;
+
+         if (n_remain < 0) {
+             // invalid sequence, abort
+             code_points.clear();
+             code_points.push_back(0);
+             return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
+         }
+
+         uint8_t mask = (1 << (7 - n_remain)) - 1;
+         value = first_byte & mask;
+
+         ++pos;
+         while (*pos != 0 && n_remain > 0) {
+             value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+             ++pos;
+             --n_remain;
+         }
+         if (n_remain == 0) {
+             code_points.push_back(value);
+         }
+     }
+     code_points.push_back(0);
+
+     return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
+ }
+
+ const llama_grammar_rules & llama_grammar_get_rules(const struct llama_grammar * grammar) {
+     return grammar->rules;
+ }
+
+ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar) {
+     return grammar->stacks;
+ }
+
+ // returns true iff pos points to the end of one of the definitions of a rule
+ static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
+     switch (pos->type) {
+         case LLAMA_GRETYPE_END: return true; // NOLINT
+         case LLAMA_GRETYPE_ALT: return true; // NOLINT
+         default: return false;
+     }
+ }
+
+ // returns true iff chr satisfies the char range at pos (regular or inverse range)
+ // asserts that pos is pointing to a char range element
+ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
+         const llama_grammar_element * pos,
+         const uint32_t chr) {
+
+     bool found = false;
+     bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
+
+     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
+
+     do {
+         if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+             // inclusive range, e.g. [a-z]
+             found = found || (pos->value <= chr && chr <= pos[1].value);
+             pos += 2;
+         } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+             // Any character matches "."
+             found = true;
+             pos += 1;
+         } else {
+             // exact char match, e.g. [a] or "a"
+             found = found || pos->value == chr;
+             pos += 1;
+         }
+     } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+     return std::make_pair(found == is_positive_char, pos);
+ }
+
+ // returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
+ // range at pos (regular or inverse range)
+ // asserts that pos is pointing to a char range element
+ static bool llama_grammar_match_partial_char(
+         const llama_grammar_element * pos,
+         const llama_partial_utf8 partial_utf8) {
+     bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
+     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+     uint32_t partial_value = partial_utf8.value;
+     int n_remain = partial_utf8.n_remain;
+
+     // invalid sequence or 7-bit char split across 2 bytes (overlong)
+     if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
+         return false;
+     }
+
+     // range of possible code points this partial UTF-8 sequence could complete to
+     uint32_t low = partial_value << (n_remain * 6);
+     uint32_t high = low | ((1 << (n_remain * 6)) - 1);
+
+     if (low == 0) {
+         if (n_remain == 2) {
+             low = 1 << 11;
+         } else if (n_remain == 3) {
+             low = 1 << 16;
+         }
+     }
+
+     do {
+         if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+             // inclusive range, e.g. [a-z]
+             if (pos->value <= high && low <= pos[1].value) {
+                 return is_positive_char;
+             }
+             pos += 2;
+         } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+             // Any character matches "."
+             return true;
+         } else {
+             // exact char match, e.g. [a] or "a"
+             if (low <= pos->value && pos->value <= high) {
+                 return is_positive_char;
+             }
+             pos += 1;
+         }
+     } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+     return !is_positive_char;
+ }
+
+ // transforms a grammar pushdown stack into N possible stacks, all ending
+ // at a character range (terminal element)
+ static void llama_grammar_advance_stack(
+         const llama_grammar_rules & rules,
+         const llama_grammar_stack & stack,
+         llama_grammar_stacks & new_stacks) {
+     if (stack.empty()) {
+         if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+             new_stacks.emplace_back(stack);
+         }
+         return;
+     }
+
+     const llama_grammar_element * pos = stack.back();
+
+     switch (pos->type) {
+         case LLAMA_GRETYPE_RULE_REF: {
+             const size_t rule_id = static_cast<size_t>(pos->value);
+             const llama_grammar_element * subpos = rules[rule_id].data();
+             do {
+                 // init new stack without the top (pos)
+                 llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+                 if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+                     // if this rule ref is followed by another element, add that to stack
+                     new_stack.push_back(pos + 1);
+                 }
+                 if (!llama_grammar_is_end_of_sequence(subpos)) {
+                     // if alternate is nonempty, add to stack
+                     new_stack.push_back(subpos);
+                 }
+                 llama_grammar_advance_stack(rules, new_stack, new_stacks);
+                 while (!llama_grammar_is_end_of_sequence(subpos)) {
+                     // scan to end of alternate def
+                     subpos++;
+                 }
+                 if (subpos->type == LLAMA_GRETYPE_ALT) {
+                     // there's another alternate def of this rule to process
+                     subpos++;
+                 } else {
+                     break;
+                 }
+             } while (true);
+             break;
+         }
+         case LLAMA_GRETYPE_CHAR:
+         case LLAMA_GRETYPE_CHAR_NOT:
+         case LLAMA_GRETYPE_CHAR_ANY:
+             if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+                 // only add the stack if it's not a duplicate of one we already have
+                 new_stacks.emplace_back(stack);
+             }
+             break;
+         default:
+             // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
+             // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
+             // those
+             GGML_ABORT("fatal error");
+     }
+ }
+
+ // takes a set of possible pushdown stacks on a grammar, which are required to
+ // be positioned at a character range (see `llama_grammar_advance_stack`), and
+ // produces the N possible stacks if the given char is accepted at those
+ // positions
+ void llama_grammar_accept(
+         const llama_grammar_rules & rules,
+         const llama_grammar_stacks & stacks,
+         const uint32_t chr,
+         llama_grammar_stacks & new_stacks) {
+     new_stacks.clear();
+
+     for (const auto & stack : stacks) {
+         if (stack.empty()) {
+             continue;
+         }
+
+         auto match = llama_grammar_match_char(stack.back(), chr);
+         if (match.first) {
+             const llama_grammar_element * pos = match.second;
+
+             // update top of stack to next element, if any
+             llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+             if (!llama_grammar_is_end_of_sequence(pos)) {
+                 new_stack.push_back(pos);
+             }
+             llama_grammar_advance_stack(rules, new_stack, new_stacks);
+         }
+     }
+ }
+
+ static llama_grammar_candidates llama_grammar_reject_candidates(
+         const llama_grammar_rules & rules,
+         const llama_grammar_stacks & stacks,
+         const llama_grammar_candidates & candidates) {
+     GGML_ASSERT(!stacks.empty()); // REVIEW
+
+     if (candidates.empty()) {
+         return {};
+     }
+
+     auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
+
+     for (size_t i = 1, size = stacks.size(); i < size; ++i) {
+         rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
+     }
+     return rejects;
+ }
+
+ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
+         const llama_grammar_rules & rules,
+         const llama_grammar_stack & stack,
+         const llama_grammar_candidates & candidates) {
+
+     llama_grammar_candidates rejects;
+     rejects.reserve(candidates.size());
+
+     if (stack.empty()) {
+         for (const auto & tok : candidates) {
+             if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
+                 rejects.push_back(tok);
+             }
+         }
+         return rejects;
+     }
+
+     const llama_grammar_element * stack_pos = stack.back();
+
+     llama_grammar_candidates next_candidates;
+     next_candidates.reserve(candidates.size());
+
+     for (const auto & tok : candidates) {
+         if (*tok.code_points == 0) {
+             // reached end of full codepoints in token, reject iff it ended in a partial sequence
+             // that cannot satisfy this position in grammar
+             if (tok.partial_utf8.n_remain != 0 &&
+                     !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
+                 rejects.push_back(tok);
+             }
+         } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
+             next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
+         } else {
+             rejects.push_back(tok);
+         }
+     }
+
+     const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
+
+     // update top of stack to next element, if any
+     llama_grammar_stack stack_after(stack.begin(), stack.end() - 1);
+     if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
+         stack_after.push_back(stack_pos_after);
+     }
+     llama_grammar_stacks next_stacks;
+     llama_grammar_advance_stack(rules, stack_after, next_stacks);
+
+     auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
+     for (const auto & tok : next_rejects) {
+         rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
+     }
+
+     return rejects;
+ }
+
+ static bool llama_grammar_detect_left_recursion(
+         const llama_grammar_rules & rules,
+         size_t rule_index,
+         std::vector<bool> * rules_visited,
+         std::vector<bool> * rules_in_progress,
+         std::vector<bool> * rules_may_be_empty) {
+     if ((*rules_in_progress)[rule_index]) {
+         return true;
+     }
+
+     (*rules_in_progress)[rule_index] = true;
+
+     const llama_grammar_rule & rule = rules[rule_index];
+
+     // First check if the rule might produce the empty string. This could be done combined with the second
+     // step but it's more readable as two steps.
+     bool at_rule_start = true;
+     for (size_t i = 0; i < rule.size(); i++) {
+         if (llama_grammar_is_end_of_sequence(&rule[i])) {
+             if (at_rule_start) {
+                 (*rules_may_be_empty)[rule_index] = true;
+                 break;
+             }
+             at_rule_start = true;
+         } else {
+             at_rule_start = false;
+         }
+     }
+
+     // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
+     // be empty)
+     bool recurse_into_nonterminal = true;
+     for (size_t i = 0; i < rule.size(); i++) {
+         if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
+             if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
+                 return true;
+             }
+             if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
+                 recurse_into_nonterminal = false;
+             }
+         } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
+             recurse_into_nonterminal = true;
+         } else {
+             recurse_into_nonterminal = false;
+         }
+     }
+
+     (*rules_in_progress)[rule_index] = false;
+     (*rules_visited)[rule_index] = true;
+     return false;
+ }
+
+ //
+ // grammar - external
+ //
+
+ struct llama_grammar * llama_grammar_init_impl(
+         const llama_grammar_element ** rules,
+         size_t n_rules,
+         size_t start_rule_index) {
+     const llama_grammar_element * pos;
+
+     // copy rule definitions into vectors
+     llama_grammar_rules vec_rules(n_rules);
+     for (size_t i = 0; i < n_rules; i++) {
+         for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
+             vec_rules[i].push_back(*pos);
+         }
+         vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
+     }
+
+     // Check for left recursion
+     std::vector<bool> rules_visited(n_rules);
+     std::vector<bool> rules_in_progress(n_rules);
+     std::vector<bool> rules_may_be_empty(n_rules);
+     for (size_t i = 0; i < n_rules; i++) {
+         if (rules_visited[i]) {
+             continue;
+         }
+         if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+             LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
+             return nullptr;
+         }
+     }
+
+     // loop over alternates of start rule to build initial stacks
+     llama_grammar_stacks stacks;
+     pos = vec_rules[start_rule_index].data();
+     do {
+         llama_grammar_stack stack;
+         if (!llama_grammar_is_end_of_sequence(pos)) {
+             // if alternate is nonempty, add to stack
+             stack.push_back(pos);
+         }
+         llama_grammar_advance_stack(vec_rules, stack, stacks);
+         while (!llama_grammar_is_end_of_sequence(pos)) {
+             // scan to end of alternate def
+             pos++;
+         }
+         if (pos->type == LLAMA_GRETYPE_ALT) {
+             // there's another alternate def of this rule to process
+             pos++;
+         } else {
+             break;
+         }
+     } while (true);
+
+     // Important: vec_rules has to be moved here, not copied, because stacks contains
+     // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+     // then the pointers would be invalidated when the local vec_rules goes out of scope.
+     return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
+ }
+
+ void llama_grammar_free_impl(struct llama_grammar * grammar) {
+     delete grammar;
+ }
+
+ struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar) {
+     llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+     // redirect elements in stacks to point to new rules
+     for (size_t is = 0; is < result->stacks.size(); is++) {
+         for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+             for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+                 for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+                     if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+                         result->stacks[is][ie] = &result->rules[ir0][ir1];
+                     }
+                 }
+             }
+         }
+     }
+
+     return result;
+ }
+
+ void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token_data_array * candidates) {
+     GGML_ASSERT(grammar);
+     GGML_ASSERT(vocab);
+
+     int64_t t_start_sample_us = ggml_time_us();
+
+     bool allow_eog = false;
+     for (const auto & stack : grammar->stacks) {
+         if (stack.empty()) {
+             allow_eog = true;
+             break;
+         }
+     }
+
+     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+     candidates_decoded.reserve(candidates->size);
+
+     llama_grammar_candidates candidates_grammar;
+     candidates_grammar.reserve(candidates->size);
+
+     for (size_t i = 0; i < candidates->size; ++i) {
+         const llama_token id = candidates->data[i].id;
+         const std::string & piece = vocab->cache_token_to_piece.at(id);
+
+         if (llama_token_is_eog_impl(*vocab, id)) {
+             if (!allow_eog) {
+                 candidates->data[i].logit = -INFINITY;
+             }
+         } else if (piece.empty() || piece[0] == 0) {
+             candidates->data[i].logit = -INFINITY;
+         } else {
+             candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
+             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
+         }
+     }
+
+     const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+     for (const auto & reject : rejects) {
+         candidates->data[reject.index].logit = -INFINITY;
+     }
+
+     smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+
+ void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token token) {
+     const int64_t t_start_sample_us = ggml_time_us();
+
+     if (llama_token_is_eog_impl(*vocab, token)) {
+         for (const auto & stack : grammar->stacks) {
+             if (stack.empty()) {
+                 return;
+             }
+         }
+         GGML_ABORT("fatal error");
+     }
+
+     const std::string & piece = vocab->cache_token_to_piece.at(token);
+
+     // Note terminating 0 in decoded string
+     const auto decoded = decode_utf8(piece, grammar->partial_utf8);
+     const auto & code_points = decoded.first;
+
+     llama_grammar_stacks tmp_new_stacks;
+     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+         llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
+         grammar->stacks = tmp_new_stacks;
+     }
+
+     grammar->partial_utf8 = decoded.second;
+     GGML_ASSERT(!grammar->stacks.empty());
+
+     smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
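
As an orientation aid, here is a minimal, hypothetical sketch of the decode_utf8 helper added above handling one multi-byte character split across two calls. It assumes decode_utf8 and llama_partial_utf8 are declared somewhere visible to the caller, which this diff does not show:

    #include <cstdio>
    #include "llama-grammar.h" // internal header added in this release

    int main() {
        // "é" is 0xC3 0xA9 in UTF-8; feed the two bytes one call at a time
        auto first = decode_utf8("\xC3", llama_partial_utf8{ 0, 0 });
        printf("%d\n", first.second.n_remain); // 1: one continuation byte still owed
        auto second = decode_utf8("\xA9", first.second);
        printf("U+%04X\n", second.first[0]);   // U+00E9: the carried state completes
        return 0;
    }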

package/src/llama.cpp/src/llama-grammar.h
@@ -0,0 +1,39 @@
+ #pragma once
+
+ #include "llama-impl.h"
+
+ struct llama_vocab;
+ struct llama_sampling;
+
+ struct llama_grammar {
+     const llama_grammar_rules rules;
+     llama_grammar_stacks stacks;
+
+     // buffer for partially generated UTF-8 sequence from accepted tokens
+     llama_partial_utf8 partial_utf8;
+ };
+
+ //
+ // internal API
+ //
+
+ struct llama_grammar * llama_grammar_init_impl(
+         const llama_grammar_element ** rules,
+         size_t n_rules,
+         size_t start_rule_index);
+
+ void llama_grammar_free_impl(struct llama_grammar * grammar);
+
+ struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar);
+
+ void llama_grammar_sample_impl(
+         const struct llama_grammar * grammar,
+         const struct llama_vocab * vocab,
+         const struct llama_sampling * smpl,
+         llama_token_data_array * candidates);
+
+ void llama_grammar_accept_token_impl(
+         struct llama_grammar * grammar,
+         const struct llama_vocab * vocab,
+         const struct llama_sampling * smpl,
+         llama_token token);
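
To close, a hedged end-to-end sketch of the new internal API: building a trivial one-rule grammar (root ::= "a") and tearing it down. llama_grammar_element and the LLAMA_GRETYPE_* constants come from the public llama.h; the snippet is an illustration, not code shipped in the package:

    #include "llama-grammar.h"

    // one rule with a single alternate that matches the character 'a'
    static const llama_grammar_element rule0[] = {
        { LLAMA_GRETYPE_CHAR, 'a' },
        { LLAMA_GRETYPE_END,  0   },
    };

    int main() {
        const llama_grammar_element * rules[] = { rule0 };
        llama_grammar * grammar = llama_grammar_init_impl(rules, /*n_rules =*/ 1, /*start_rule_index =*/ 0);
        // grammar->stacks now holds one stack positioned at the 'a' terminal;
        // llama_grammar_sample_impl / llama_grammar_accept_token_impl drive it from here
        llama_grammar_free_impl(grammar);
        return 0;
    }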