@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/CMakeLists.txt CHANGED
@@ -53,8 +53,6 @@ if(CMAKE_BUILD_TYPE STREQUAL "Release")
   endif()
 endif()
 
-include_directories(${CMAKE_JS_INC})
-
 # flags: -fPIC
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
@@ -76,6 +74,11 @@ add_custom_target(
 set(LLAMA_STATIC ON CACHE BOOL "Build llama as static library")
 add_subdirectory("src/llama.cpp")
 
+include_directories(
+  ${CMAKE_JS_INC}
+  "src/llama.cpp"
+)
+
 file(
   GLOB SOURCE_FILES
   "src/addons.cc"
Binary files changed (see the package/bin entries in the file list above); no textual diff is shown.
package/lib/binding.ts CHANGED
@@ -1,5 +1,10 @@
 import * as path from 'path'
 
+export type ChatMessage = {
+  role: string
+  text: string
+}
+
 export type LlamaModelOptions = {
   model: string
   embedding?: boolean
@@ -12,7 +17,8 @@ export type LlamaModelOptions = {
 }
 
 export type LlamaCompletionOptions = {
-  prompt: string
+  messages?: ChatMessage[]
+  prompt?: string
   n_samples?: number
   temperature?: number
   top_k?: number
@@ -48,6 +54,7 @@ export type EmbeddingResult = {
 export interface LlamaContext {
   new (options: LlamaModelOptions): LlamaContext
   getSystemInfo(): string
+  getFormattedChat(messages: ChatMessage[]): string
   completion(options: LlamaCompletionOptions, callback?: (token: LlamaCompletionToken) => void): Promise<LlamaCompletionResult>
   stopCompletion(): void
   tokenize(text: string): Promise<TokenizeResult>
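To make the new chat API concrete, here is a minimal TypeScript sketch based only on the declarations visible in this diff: `LlamaCompletionOptions` now accepts `messages` in place of `prompt`, and `LlamaContext` gains `getFormattedChat()`. The import path and the way a `LlamaContext` instance is obtained are assumptions, not part of this diff. Note also that `ChatMessage` is declared with a `text` field while the native code (`LlamaContext.cpp`, further down) reads each message's `content` key; the sketch follows the native side.

```ts
// Sketch only: the import path and how the context is constructed are
// assumptions; they are not shown in this diff.
import type { LlamaContext } from '@fugood/llama.node/lib/binding'

async function runChat(ctx: LlamaContext): Promise<void> {
  // The native helper get_messages() (see LlamaContext.cpp) reads
  // `role` and `content` from each message object.
  const messages = [
    { role: 'system', content: 'You are a helpful assistant.' },
    { role: 'user', content: 'Hello!' },
  ]

  // New in 0.3.0: pass `messages` instead of `prompt`; the addon renders
  // them with the model's chat template before running the completion.
  const result = await ctx.completion({ messages: messages as any })
  console.log(result)

  // The rendered prompt can also be inspected directly:
  console.log(ctx.getFormattedChat(messages as any))
}
```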
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "0.2.2",
+  "version": "0.3.0",
   "description": "Llama.cpp for Node.js",
   "main": "lib/index.js",
   "scripts": {
package/patches/llama.patch CHANGED
@@ -1,20 +1,20 @@
-diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
-index b9449be0..cfa0f774 100644
---- a/ggml-vulkan.cpp
-+++ b/ggml-vulkan.cpp
-@@ -525,9 +525,15 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
+diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
+index fa68360b..f9ff7b5d 100644
+--- a/ggml/src/ggml-vulkan.cpp
++++ b/ggml/src/ggml-vulkan.cpp
+@@ -617,9 +617,15 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co
      vk::PipelineCreateFlags(),
      pipeline_shader_create_info,
      pipeline->layout);
--    pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
+-    pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
 
--    ctx->device->pipelines.push_back(pipeline);
+-    device->pipelines.push_back(pipeline);
 +    try {
-+        pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
-+        ctx->device->pipelines.push_back(pipeline);
-+    } catch (vk::UnknownError const&) {
-+        std::cerr << "ggml_vk_create_pipeline: Failed to create pipeline " << name << std::endl;
-+        ggml_vk_destroy_pipeline(ctx->device->device, pipeline);
++        pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
++        device->pipelines.push_back(pipeline);
++    } catch(vk::UnknownError const&) {
++        VK_LOG_DEBUG("Failed to create pipeline " << name);
++        ggml_vk_destroy_pipeline(device->device, pipeline);
 +        pipeline.reset();
 +    }
     }
package/src/DetokenizeWorker.cpp CHANGED
@@ -8,7 +8,7 @@ DetokenizeWorker::DetokenizeWorker(const Napi::CallbackInfo &info,
       _tokens(std::move(tokens)) {}
 
 void DetokenizeWorker::Execute() {
-  const auto text = ::llama_detokenize_bpe(_sess->context(), _tokens);
+  const auto text = ::llama_detokenize(_sess->context(), _tokens);
   _text = std::move(text);
 }
 
package/src/LlamaContext.cpp CHANGED
@@ -7,12 +7,27 @@
 #include "SaveSessionWorker.h"
 #include "TokenizeWorker.h"
 
+std::vector<llama_chat_msg> get_messages(Napi::Array messages) {
+  std::vector<llama_chat_msg> chat;
+  for (size_t i = 0; i < messages.Length(); i++) {
+    auto message = messages.Get(i).As<Napi::Object>();
+    chat.push_back({
+        get_option<std::string>(message, "role", ""),
+        get_option<std::string>(message, "content", ""),
+    });
+  }
+  return std::move(chat);
+}
+
 void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
   Napi::Function func = DefineClass(
       env, "LlamaContext",
       {InstanceMethod<&LlamaContext::GetSystemInfo>(
            "getSystemInfo",
            static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::GetFormattedChat>(
+           "getFormattedChat",
+           static_cast<napi_property_attributes>(napi_enumerable)),
        InstanceMethod<&LlamaContext::Completion>(
            "completion",
            static_cast<napi_property_attributes>(napi_enumerable)),
@@ -89,6 +104,17 @@ Napi::Value LlamaContext::GetSystemInfo(const Napi::CallbackInfo &info) {
   return Napi::String::New(info.Env(), _info);
 }
 
+// getFormattedChat(messages: [{ role: string, content: string }]): string
+Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() < 1 || !info[0].IsArray()) {
+    Napi::TypeError::New(env, "Array expected").ThrowAsJavaScriptException();
+  }
+  auto messages = info[0].As<Napi::Array>();
+  auto formatted = llama_chat_apply_template(_sess->model(), "", get_messages(messages), true);
+  return Napi::String::New(env, formatted);
+}
+
 // completion(options: LlamaCompletionOptions, onToken?: (token: string) =>
 // void): Promise<LlamaCompletionResult>
 Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
@@ -110,7 +136,13 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   auto options = info[0].As<Napi::Object>();
 
   gpt_params params = _sess->params();
-  params.prompt = get_option<std::string>(options, "prompt", "");
+  if (options.Has("messages") && options.Get("messages").IsArray()) {
+    auto messages = options.Get("messages").As<Napi::Array>();
+    auto formatted = llama_chat_apply_template(_sess->model(), "", get_messages(messages), true);
+    params.prompt = formatted;
+  } else {
+    params.prompt = get_option<std::string>(options, "prompt", "");
+  }
   if (params.prompt.empty()) {
     Napi::TypeError::New(env, "Prompt is required")
         .ThrowAsJavaScriptException();
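Equivalently, the template can be rendered explicitly and passed as a plain prompt; the `Completion` change above also shows that `messages`, when present, takes precedence over `prompt`. A small sketch under the same assumptions as the previous one:

```ts
import type { LlamaContext } from '@fugood/llama.node/lib/binding' // path assumed

// Manual path: getFormattedChat() wraps llama_chat_apply_template(), so this
// is equivalent to passing `messages` directly to completion().
async function runChatManually(
  ctx: LlamaContext,
  messages: { role: string; content: string }[],
) {
  const prompt = ctx.getFormattedChat(messages as any)
  return ctx.completion({ prompt })
}
```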
package/src/LlamaContext.h CHANGED
@@ -9,6 +9,7 @@ public:
 
 private:
   Napi::Value GetSystemInfo(const Napi::CallbackInfo &info);
+  Napi::Value GetFormattedChat(const Napi::CallbackInfo &info);
   Napi::Value Completion(const Napi::CallbackInfo &info);
   void StopCompletion(const Napi::CallbackInfo &info);
   Napi::Value Tokenize(const Napi::CallbackInfo &info);
package/src/LoadSessionWorker.cpp CHANGED
@@ -15,6 +15,7 @@ void LoadSessionWorker::Execute() {
                         tokens.capacity(), &count)) {
     SetError("Failed to load session");
   }
+  tokens.resize(count);
   _sess->set_tokens(std::move(tokens));
   _sess->get_mutex().unlock();
 }
package/src/llama.cpp/.github/workflows/bench.yml ADDED
@@ -0,0 +1,310 @@
+# Benchmark
+name: Benchmark
+
+on:
+  workflow_dispatch:
+    inputs:
+      gpu-series:
+        description: 'Azure GPU series to run with'
+        required: true
+        type: choice
+        options:
+          - Standard_NC4as_T4_v3
+          - Standard_NC24ads_A100_v4
+          - Standard_NC80adis_H100_v5
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
+      duration:
+        description: 'Duration of the bench'
+        type: string
+        default: 10m
+
+  push:
+    branches:
+      - master
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+  pull_request_target:
+    types: [opened, synchronize, reopened]
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+  schedule:
+    - cron: '04 2 * * *'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
+  cancel-in-progress: true
+
+jobs:
+  bench-server-baseline:
+    runs-on: Standard_NC4as_T4_v3
+    env:
+      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
+      N_USERS: 8
+      DURATION: 10m
+
+    strategy:
+      matrix:
+        model: [phi-2]
+        ftype: [q4_0, q8_0, f16]
+        include:
+          - model: phi-2
+            ftype: q4_0
+            pr_comment_enabled: "true"
+
+    if: |
+      inputs.gpu-series == 'Standard_NC4as_T4_v3'
+      || (
+        github.event_name == 'schedule'
+        && github.ref_name == 'master'
+        && github.repository_owner == 'ggerganov'
+      )
+      || github.event_name == 'pull_request_target'
+      || (
+        github.event_name == 'push'
+        && github.event.ref == 'refs/heads/master'
+        && github.repository_owner == 'ggerganov'
+      )
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Install python env
+        id: pipenv
+        run: |
+          cd examples/server/bench
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install -r requirements.txt
+
+      - name: Prometheus
+        id: install_prometheus
+        run: |
+          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
+          tar xzf prometheus*.tar.gz --strip-components=1
+          ./prometheus --config.file=examples/server/bench/prometheus.yml &
+          while ! nc -z localhost 9090; do
+            sleep 0.1
+          done
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+
+      - name: Install k6 and xk6-sse
+        id: k6_installation
+        run: |
+          cd examples/server/bench
+          go install go.k6.io/xk6/cmd/xk6@latest
+          xk6 build master \
+            --with github.com/phymbert/xk6-sse
+
+      - name: Build
+        id: cmake_build
+        run: |
+          set -eux
+          cmake -B build \
+            -DGGML_NATIVE=OFF \
+            -DLLAMA_BUILD_SERVER=ON \
+            -DLLAMA_CURL=ON \
+            -DLLAMA_CUBLAS=ON \
+            -DCUDAToolkit_ROOT=/usr/local/cuda \
+            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+            -DCMAKE_CUDA_ARCHITECTURES=75 \
+            -DLLAMA_FATAL_WARNINGS=OFF \
+            -DLLAMA_ALL_WARNINGS=OFF \
+            -DCMAKE_BUILD_TYPE=Release;
+          cmake --build build --config Release -j $(nproc) --target llama-server
+
+      - name: Download the dataset
+        id: download_dataset
+        run: |
+          cd examples/server/bench
+          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+      - name: Server bench
+        id: server_bench
+        run: |
+          set -eux
+
+          cd examples/server/bench
+          source venv/bin/activate
+          python bench.py \
+            --runner-label ${{ env.RUNNER_LABEL }} \
+            --name ${{ github.job }} \
+            --branch ${{ github.head_ref || github.ref_name }} \
+            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
+            --scenario script.js \
+            --duration ${{ github.event.inputs.duration || env.DURATION }} \
+            --hf-repo ggml-org/models \
+            --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
+            --model-path-prefix /models \
+            --parallel ${{ env.N_USERS }} \
+            -ngl 33 \
+            --batch-size 2048 \
+            --ubatch-size 256 \
+            --ctx-size 16384 \
+            --n-prompts 1000 \
+            --max-prompt-tokens 1024 \
+            --max-tokens 2048
+
+          cat results.github.env >> $GITHUB_ENV
+
+          # Remove dataset as we do not want it in the artefact
+          rm ShareGPT_V3_unfiltered_cleaned_split.json
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+          compression-level: 9
+          path: |
+            examples/server/bench/*.jpg
+            examples/server/bench/*.json
+            examples/server/bench/*.log
+
+      - name: Commit status
+        uses: Sibz/github-status-action@v1
+        with:
+          authToken: ${{secrets.GITHUB_TOKEN}}
+          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
+          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+          description: |
+            ${{ env.BENCH_RESULTS }}
+          state: 'success'
+
+      - name: Upload benchmark images
+        uses: devicons/public-upload-to-imgur@v2.2.2
+        continue-on-error: true # Important as it looks unstable: 503
+        id: imgur_step
+        with:
+          client_id: ${{secrets.IMGUR_CLIENT_ID}}
+          path: |
+            examples/server/bench/prompt_tokens_seconds.jpg
+            examples/server/bench/predicted_tokens_seconds.jpg
+            examples/server/bench/kv_cache_usage_ratio.jpg
+            examples/server/bench/requests_processing.jpg
+
+      - name: Extract mermaid
+        id: set_mermaid
+        run: |
+          set -eux
+
+          cd examples/server/bench
+          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
+          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
+          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
+          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
+          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
+          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
+          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Extract image url
+        id: extract_image_url
+        continue-on-error: true
+        run: |
+          set -eux
+
+          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
+          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
+          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
+          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
+
+      - name: Comment PR
+        uses: mshick/add-pr-comment@v2
+        id: comment_pr
+        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
+        with:
+          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+          message: |
+            <p align="center">
+
+            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+
+            </p>
+
+            <details>
+
+            <summary>Expand details for performance related PR only</summary>
+
+            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
+            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
+            - ${{ env.BENCH_GRAPH_XLABEL }}
+
+
+            <p align="center">
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
+
+            <details>
+
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PROMPT_TOKENS_SECONDS }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
+
+            <details>
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PREDICTED_TOKENS_SECONDS }}
+            ```
+
+            </details>
+
+            </p>
+
+            <details>
+
+            <summary>Details</summary>
+
+            <p align="center">
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
+
+            <details>
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.KV_CACHE_USAGE_RATIO }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
+
+            <details>
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.REQUESTS_PROCESSING }}
+            ```
+
+            </details>
+
+            </p>
+            </details>
+            </details>