@fugood/llama.node 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp
@@ -23,6 +23,10 @@
  #include "ggml-cuda.h"
  #include "ggml-sycl.h"
 
+ #ifdef GGML_USE_CANN
+ #include "ggml-cann.h"
+ #endif
+
  // utils
  static uint64_t get_time_ns() {
  using clock = std::chrono::high_resolution_clock;
@@ -41,20 +45,6 @@ static std::string join(const std::vector<T> & values, const std::string & delim
  return str.str();
  }
 
- template<class T>
- static std::vector<T> split(const std::string & str, char delim) {
- std::vector<T> values;
- std::istringstream str_stream(str);
- std::string token;
- while (std::getline(str_stream, token, delim)) {
- T value;
- std::istringstream token_stream(token);
- token_stream >> value;
- values.push_back(value);
- }
- return values;
- }
-
  template<typename T, typename F>
  static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
  std::vector<std::string> str_values;
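
The split<T> helper deleted above is not lost: the call sites later in this diff switch to string_split<T>, which this release appears to provide via common/common.h (note the +191 -77 change to common.h in the file list). A standalone sketch of the parsing behavior, reusing the deleted template's own body under the string_split name; the main() harness and the "10,20,99" input are illustrative only:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    // same body as the split<T> template removed above; the call sites in this
    // diff now use a string_split<T> of this shape from the common library
    template<class T>
    static std::vector<T> string_split(const std::string & str, char delim) {
        std::vector<T> values;
        std::istringstream str_stream(str);
        std::string token;
        while (std::getline(str_stream, token, delim)) {
            T value;
            std::istringstream token_stream(token);
            token_stream >> value;
            values.push_back(value);
        }
        return values;
    }

    int main() {
        // e.g. how an argument like "-ngl 10,20,99" becomes three benchmark configs
        for (int n : string_split<int>("10,20,99", ',')) {
            std::cout << n << "\n";
        }
        return 0;
    }
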
@@ -134,22 +124,51 @@ static std::string get_gpu_info() {
  id += "/";
  }
  }
+ #endif
+ #ifdef GGML_USE_CANN
+ uint32_t count = ggml_backend_cann_get_device_count();
+ for (uint32_t i = 0; i < count; i++) {
+ char buf[128];
+ ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
+ id += buf;
+ if (i < count - 1) {
+ id += "/";
+ }
+ }
  #endif
  // TODO: other backends
  return id;
  }
 
  // command line params
- enum output_formats {CSV, JSON, MARKDOWN, SQL};
+ enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
 
  static const char * output_format_str(output_formats format) {
  switch (format) {
+ case NONE: return "none";
  case CSV: return "csv";
  case JSON: return "json";
  case MARKDOWN: return "md";
  case SQL: return "sql";
- default: GGML_ASSERT(!"invalid output format");
+ default: GGML_ABORT("invalid output format");
+ }
+ }
+
+ static bool output_format_from_str(const std::string & s, output_formats & format) {
+ if (s == "none") {
+ format = NONE;
+ } else if (s == "csv") {
+ format = CSV;
+ } else if (s == "json") {
+ format = JSON;
+ } else if (s == "md") {
+ format = MARKDOWN;
+ } else if (s == "sql") {
+ format = SQL;
+ } else {
+ return false;
  }
+ return true;
  }
 
  static const char * split_mode_str(llama_split_mode mode) {
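
The hunk above turns the one-way output_formats enum/switch pair into a round trip, and the new NONE entry lets a stream be disabled outright (used further down to make the stdout and stderr printers independently optional). A minimal standalone sketch of that round trip, with the enum and parser copied from the hunk; the main() harness, and returning "?" instead of GGML_ABORT for an unmatched enum, are illustrative assumptions:

    #include <cstdio>
    #include <string>

    enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};

    // parser copied from the hunk above: returns false on unknown names
    static bool output_format_from_str(const std::string & s, output_formats & format) {
        if      (s == "none") { format = NONE; }
        else if (s == "csv")  { format = CSV; }
        else if (s == "json") { format = JSON; }
        else if (s == "md")   { format = MARKDOWN; }
        else if (s == "sql")  { format = SQL; }
        else { return false; }
        return true;
    }

    static const char * output_format_str(output_formats format) {
        switch (format) {
            case NONE:     return "none";
            case CSV:      return "csv";
            case JSON:     return "json";
            case MARKDOWN: return "md";
            case SQL:      return "sql";
        }
        return "?"; // the real code calls GGML_ABORT here
    }

    int main() {
        output_formats fmt = NONE;
        std::printf("%d %s\n", output_format_from_str("md", fmt), output_format_str(fmt)); // 1 md
        std::printf("%d\n", output_format_from_str("xml", fmt));                           // 0
        return 0;
    }
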
@@ -157,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) {
  case LLAMA_SPLIT_MODE_NONE: return "none";
  case LLAMA_SPLIT_MODE_LAYER: return "layer";
  case LLAMA_SPLIT_MODE_ROW: return "row";
- default: GGML_ASSERT(!"invalid split mode");
+ default: GGML_ABORT("invalid split mode");
  }
  }
 
@@ -178,6 +197,7 @@ struct cmd_params {
  std::vector<ggml_type> type_v;
  std::vector<int> n_threads;
  std::vector<int> n_gpu_layers;
+ std::vector<std::string> rpc_servers;
  std::vector<llama_split_mode> split_mode;
  std::vector<int> main_gpu;
  std::vector<bool> no_kv_offload;
@@ -189,30 +209,33 @@
  int reps;
  bool verbose;
  output_formats output_format;
+ output_formats output_format_stderr;
  };
 
  static const cmd_params cmd_params_defaults = {
- /* model */ {"models/7B/ggml-model-q4_0.gguf"},
- /* n_prompt */ {512},
- /* n_gen */ {128},
- /* n_pg */ {},
- /* n_batch */ {2048},
- /* n_ubatch */ {512},
- /* type_k */ {GGML_TYPE_F16},
- /* type_v */ {GGML_TYPE_F16},
- /* n_threads */ {cpu_get_num_math()},
- /* n_gpu_layers */ {99},
- /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
- /* main_gpu */ {0},
- /* no_kv_offload */ {false},
- /* flash_attn */ {false},
- /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
- /* use_mmap */ {true},
- /* embeddings */ {false},
- /* numa */ GGML_NUMA_STRATEGY_DISABLED,
- /* reps */ 5,
- /* verbose */ false,
- /* output_format */ MARKDOWN
+ /* model */ {"models/7B/ggml-model-q4_0.gguf"},
+ /* n_prompt */ {512},
+ /* n_gen */ {128},
+ /* n_pg */ {},
+ /* n_batch */ {2048},
+ /* n_ubatch */ {512},
+ /* type_k */ {GGML_TYPE_F16},
+ /* type_v */ {GGML_TYPE_F16},
+ /* n_threads */ {cpu_get_num_math()},
+ /* n_gpu_layers */ {99},
+ /* rpc_servers */ {""},
+ /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
+ /* main_gpu */ {0},
+ /* no_kv_offload */ {false},
+ /* flash_attn */ {false},
+ /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
+ /* use_mmap */ {true},
+ /* embeddings */ {false},
+ /* numa */ GGML_NUMA_STRATEGY_DISABLED,
+ /* reps */ 5,
+ /* verbose */ false,
+ /* output_format */ MARKDOWN,
+ /* output_format_stderr */ NONE,
  };
 
  static void print_usage(int /* argc */, char ** argv) {
@@ -230,6 +253,7 @@ static void print_usage(int /* argc */, char ** argv) {
  printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
  printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
  printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+ printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
  printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
  printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
  printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -240,6 +264,7 @@ static void print_usage(int /* argc */, char ** argv) {
  printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
  printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
  printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
+ printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
  printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
  printf("\n");
  printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
@@ -281,7 +306,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
  params.verbose = cmd_params_defaults.verbose;
  params.output_format = cmd_params_defaults.output_format;
+ params.output_format_stderr = cmd_params_defaults.output_format_stderr;
  params.reps = cmd_params_defaults.reps;
+ params.numa = cmd_params_defaults.numa;
 
  for (int i = 1; i < argc; i++) {
  arg = argv[i];
@@ -297,28 +324,28 @@
  invalid_param = true;
  break;
  }
- auto p = split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
  params.model.insert(params.model.end(), p.begin(), p.end());
  } else if (arg == "-p" || arg == "--n-prompt") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- auto p = split<int>(argv[i], split_delim);
+ auto p = string_split<int>(argv[i], split_delim);
  params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
  } else if (arg == "-n" || arg == "--n-gen") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- auto p = split<int>(argv[i], split_delim);
+ auto p = string_split<int>(argv[i], split_delim);
  params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
  } else if (arg == "-pg") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- auto p = split<std::string>(argv[i], ',');
+ auto p = string_split<std::string>(argv[i], ',');
  if (p.size() != 2) {
  invalid_param = true;
  break;
@@ -329,21 +356,21 @@
  invalid_param = true;
  break;
  }
- auto p = split<int>(argv[i], split_delim);
+ auto p = string_split<int>(argv[i], split_delim);
  params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
  } else if (arg == "-ub" || arg == "--ubatch-size") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- auto p = split<int>(argv[i], split_delim);
+ auto p = string_split<int>(argv[i], split_delim);
  params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
  } else if (arg == "-ctk" || arg == "--cache-type-k") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- auto p = split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
  std::vector<ggml_type> types;
  for (const auto & t : p) {
  ggml_type gt = ggml_type_from_name(t);
@@ -359,7 +386,7 @@
  invalid_param = true;
  break;
  }
- auto p = split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
  std::vector<ggml_type> types;
  for (const auto & t : p) {
  ggml_type gt = ggml_type_from_name(t);
@@ -375,21 +402,27 @@
  invalid_param = true;
  break;
  }
- auto p = split<int>(argv[i], split_delim);
+ auto p = string_split<int>(argv[i], split_delim);
  params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
  } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- auto p = split<int>(argv[i], split_delim);
+ auto p = string_split<int>(argv[i], split_delim);
  params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+ } else if (arg == "-rpc" || arg == "--rpc") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.rpc_servers.push_back(argv[i]);
  } else if (arg == "-sm" || arg == "--split-mode") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- auto p = split<std::string>(argv[i], split_delim);
+ auto p = string_split<std::string>(argv[i], split_delim);
  std::vector<llama_split_mode> modes;
  for (const auto & m : p) {
  llama_split_mode mode;
@@ -411,13 +444,13 @@
  invalid_param = true;
  break;
  }
- params.main_gpu = split<int>(argv[i], split_delim);
+ params.main_gpu = string_split<int>(argv[i], split_delim);
  } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- auto p = split<bool>(argv[i], split_delim);
+ auto p = string_split<bool>(argv[i], split_delim);
  params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
  } else if (arg == "--numa") {
  if (++i >= argc) {
@@ -435,28 +468,28 @@
  invalid_param = true;
  break;
  }
- auto p = split<bool>(argv[i], split_delim);
+ auto p = string_split<bool>(argv[i], split_delim);
  params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
  } else if (arg == "-mmp" || arg == "--mmap") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- auto p = split<bool>(argv[i], split_delim);
+ auto p = string_split<bool>(argv[i], split_delim);
  params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
  } else if (arg == "-embd" || arg == "--embeddings") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- auto p = split<bool>(argv[i], split_delim);
+ auto p = string_split<bool>(argv[i], split_delim);
  params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
  } else if (arg == "-ts" || arg == "--tensor-split") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- for (auto ts : split<std::string>(argv[i], split_delim)) {
+ for (auto ts : string_split<std::string>(argv[i], split_delim)) {
  // split string by ; and /
  const std::regex regex{R"([;/]+)"};
  std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
@@ -484,18 +517,13 @@
  invalid_param = true;
  break;
  }
- if (argv[i] == std::string("csv")) {
- params.output_format = CSV;
- } else if (argv[i] == std::string("json")) {
- params.output_format = JSON;
- } else if (argv[i] == std::string("md")) {
- params.output_format = MARKDOWN;
- } else if (argv[i] == std::string("sql")) {
- params.output_format = SQL;
- } else {
+ invalid_param = !output_format_from_str(argv[i], params.output_format);
+ } else if (arg == "-oe" || arg == "--output-err") {
+ if (++i >= argc) {
  invalid_param = true;
  break;
  }
+ invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
  } else if (arg == "-v" || arg == "--verbose") {
  params.verbose = true;
  } else {
@@ -519,6 +547,7 @@
  if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
  if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
  if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
+ if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; }
  if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
  if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
  if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
@@ -541,6 +570,7 @@ struct cmd_params_instance {
  ggml_type type_v;
  int n_threads;
  int n_gpu_layers;
+ std::string rpc_servers;
  llama_split_mode split_mode;
  int main_gpu;
  bool no_kv_offload;
@@ -553,6 +583,9 @@
  llama_model_params mparams = llama_model_default_params();
 
  mparams.n_gpu_layers = n_gpu_layers;
+ if (!rpc_servers.empty()) {
+ mparams.rpc_servers = rpc_servers.c_str();
+ }
  mparams.split_mode = split_mode;
  mparams.main_gpu = main_gpu;
  mparams.tensor_split = tensor_split.data();
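
This hunk is where a --rpc value actually takes effect: a non-empty, comma-separated server list is handed to llama_model_params as a C string. A hedged sketch of that flow (llama_model_default_params and the rpc_servers field come straight from the hunk; the helper name and the host:port addresses are made up for illustration, and the std::string must outlive the params since only a pointer is stored):

    #include <string>
    #include "llama.h"

    // build model params that offload layers to remote ggml RPC servers
    static llama_model_params make_rpc_mparams(const std::string & rpc_servers,
                                               int n_gpu_layers) {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = n_gpu_layers;
        if (!rpc_servers.empty()) {
            // llama_model_params keeps a raw pointer; the string must stay alive
            mparams.rpc_servers = rpc_servers.c_str();
        }
        return mparams;
    }

    // usage, e.g. mirroring `llama-bench -rpc <list> -ngl 99`:
    // std::string servers = "192.168.1.10:50052,192.168.1.11:50052";
    // llama_model_params mp = make_rpc_mparams(servers, 99);
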
@@ -564,6 +597,7 @@
  bool equal_mparams(const cmd_params_instance & other) const {
  return model == other.model &&
  n_gpu_layers == other.n_gpu_layers &&
+ rpc_servers == other.rpc_servers &&
  split_mode == other.split_mode &&
  main_gpu == other.main_gpu &&
  use_mmap == other.use_mmap &&
@@ -592,6 +626,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
  // this ordering minimizes the number of times that each model needs to be reloaded
  for (const auto & m : params.model)
  for (const auto & nl : params.n_gpu_layers)
+ for (const auto & rpc : params.rpc_servers)
  for (const auto & sm : params.split_mode)
  for (const auto & mg : params.main_gpu)
  for (const auto & ts : params.tensor_split)
@@ -618,6 +653,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
  /* .type_v = */ tv,
  /* .n_threads = */ nt,
  /* .n_gpu_layers = */ nl,
+ /* .rpc_servers = */ rpc,
  /* .split_mode = */ sm,
  /* .main_gpu = */ mg,
  /* .no_kv_offload= */ nkvo,
@@ -643,6 +679,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
  /* .type_v = */ tv,
  /* .n_threads = */ nt,
  /* .n_gpu_layers = */ nl,
+ /* .rpc_servers = */ rpc,
  /* .split_mode = */ sm,
  /* .main_gpu = */ mg,
  /* .no_kv_offload= */ nkvo,
@@ -668,6 +705,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
  /* .type_v = */ tv,
  /* .n_threads = */ nt,
  /* .n_gpu_layers = */ nl,
+ /* .rpc_servers = */ rpc,
  /* .split_mode = */ sm,
  /* .main_gpu = */ mg,
  /* .no_kv_offload= */ nkvo,
@@ -687,7 +725,6 @@ struct test {
  static const std::string build_commit;
  static const int build_number;
  static const bool cuda;
- static const bool opencl;
  static const bool vulkan;
  static const bool kompute;
  static const bool metal;
@@ -703,6 +740,7 @@
  int n_batch;
  int n_ubatch;
  int n_threads;
+ bool has_rpc;
  ggml_type type_k;
  ggml_type type_v;
  int n_gpu_layers;
@@ -728,6 +766,7 @@
  n_batch = inst.n_batch;
  n_ubatch = inst.n_ubatch;
  n_threads = inst.n_threads;
+ has_rpc = !inst.rpc_servers.empty();
  type_k = inst.type_k;
  type_v = inst.type_v;
  n_gpu_layers = inst.n_gpu_layers;
@@ -775,9 +814,6 @@
  if (cuda) {
  return GGML_CUDA_NAME;
  }
- if (opencl) {
- return "OpenCL";
- }
  if (vulkan) {
  return "Vulkan";
  }
@@ -803,7 +839,7 @@
  static const std::vector<std::string> & get_fields() {
  static const std::vector<std::string> fields = {
  "build_commit", "build_number",
- "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
+ "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
  "cpu_info", "gpu_info",
  "model_filename", "model_type", "model_size", "model_n_params",
  "n_batch", "n_ubatch",
@@ -829,7 +865,7 @@
  field == "avg_ns" || field == "stddev_ns") {
  return INT;
  }
- if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
+ if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
  field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
  field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
  return BOOL;
@@ -858,8 +894,8 @@
  }
  std::vector<std::string> values = {
  build_commit, std::to_string(build_number),
- std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
- std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
+ std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
+ std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
  cpu_info, gpu_info,
  model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
  std::to_string(n_batch), std::to_string(n_ubatch),
@@ -887,7 +923,6 @@
  const std::string test::build_commit = LLAMA_COMMIT;
  const int test::build_number = LLAMA_BUILD_NUMBER;
  const bool test::cuda = !!ggml_cpu_has_cuda();
- const bool test::opencl = !!ggml_cpu_has_clblast();
  const bool test::vulkan = !!ggml_cpu_has_vulkan();
  const bool test::kompute = !!ggml_cpu_has_kompute();
  const bool test::metal = !!ggml_cpu_has_metal();
@@ -1011,6 +1046,27 @@ struct markdown_printer : public printer {
  if (field == "n_gpu_layers") {
  return 3;
  }
+ if (field == "n_threads") {
+ return 7;
+ }
+ if (field == "n_batch") {
+ return 7;
+ }
+ if (field == "n_ubatch") {
+ return 8;
+ }
+ if (field == "type_k" || field == "type_v") {
+ return 6;
+ }
+ if (field == "split_mode") {
+ return 5;
+ }
+ if (field == "flash_attn") {
+ return 2;
+ }
+ if (field == "use_mmap") {
+ return 4;
+ }
  if (field == "test") {
  return 13;
  }
@@ -1138,6 +1194,9 @@
  value = buf;
  } else if (field == "backend") {
  value = test::get_backend();
+ if (t.has_rpc) {
+ value += "+RPC";
+ }
  } else if (field == "test") {
  if (t.n_prompt > 0 && t.n_gen == 0) {
  snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
@@ -1254,6 +1313,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
  (void) user_data;
  }
 
+ static std::unique_ptr<printer> create_printer(output_formats format) {
+ switch (format) {
+ case NONE:
+ return nullptr;
+ case CSV:
+ return std::unique_ptr<printer>(new csv_printer());
+ case JSON:
+ return std::unique_ptr<printer>(new json_printer());
+ case MARKDOWN:
+ return std::unique_ptr<printer>(new markdown_printer());
+ case SQL:
+ return std::unique_ptr<printer>(new sql_printer());
+ }
+ GGML_ABORT("fatal error");
+ }
+
  int main(int argc, char ** argv) {
  // try to set locale for unicode characters in markdown
  setlocale(LC_CTYPE, ".UTF-8");
@@ -1280,26 +1355,18 @@ int main(int argc, char ** argv) {
  llama_numa_init(params.numa);
 
  // initialize printer
- std::unique_ptr<printer> p;
- switch (params.output_format) {
- case CSV:
- p.reset(new csv_printer());
- break;
- case JSON:
- p.reset(new json_printer());
- break;
- case MARKDOWN:
- p.reset(new markdown_printer());
- break;
- case SQL:
- p.reset(new sql_printer());
- break;
- default:
- assert(false);
- exit(1);
+ std::unique_ptr<printer> p = create_printer(params.output_format);
+ std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
+
+ if (p) {
+ p->fout = stdout;
+ p->print_header(params);
+ }
+
+ if (p_err) {
+ p_err->fout = stderr;
+ p_err->print_header(params);
  }
- p->fout = stdout;
- p->print_header(params);
 
  std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
 
@@ -1357,7 +1424,15 @@
  t.samples_ns.push_back(t_ns);
  }
 
- p->print_test(t);
+ if (p) {
+ p->print_test(t);
+ fflush(p->fout);
+ }
+
+ if (p_err) {
+ p_err->print_test(t);
+ fflush(p_err->fout);
+ }
 
  llama_print_timings(ctx);
 
@@ -1366,7 +1441,13 @@
 
  llama_free_model(lmodel);
 
- p->print_footer();
+ if (p) {
+ p->print_footer();
+ }
+
+ if (p_err) {
+ p_err->print_footer();
+ }
 
  llama_backend_free();
 
package/src/llama.cpp/examples/llama.android/app/build.gradle.kts
@@ -0,0 +1,65 @@
+ plugins {
+ id("com.android.application")
+ id("org.jetbrains.kotlin.android")
+ }
+
+ android {
+ namespace = "com.example.llama"
+ compileSdk = 34
+
+ defaultConfig {
+ applicationId = "com.example.llama"
+ minSdk = 33
+ targetSdk = 34
+ versionCode = 1
+ versionName = "1.0"
+
+ testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
+ vectorDrawables {
+ useSupportLibrary = true
+ }
+ }
+
+ buildTypes {
+ release {
+ isMinifyEnabled = false
+ proguardFiles(
+ getDefaultProguardFile("proguard-android-optimize.txt"),
+ "proguard-rules.pro"
+ )
+ }
+ }
+ compileOptions {
+ sourceCompatibility = JavaVersion.VERSION_1_8
+ targetCompatibility = JavaVersion.VERSION_1_8
+ }
+ kotlinOptions {
+ jvmTarget = "1.8"
+ }
+ buildFeatures {
+ compose = true
+ }
+ composeOptions {
+ kotlinCompilerExtensionVersion = "1.5.1"
+ }
+ }
+
+ dependencies {
+
+ implementation("androidx.core:core-ktx:1.12.0")
+ implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
+ implementation("androidx.activity:activity-compose:1.8.2")
+ implementation(platform("androidx.compose:compose-bom:2023.08.00"))
+ implementation("androidx.compose.ui:ui")
+ implementation("androidx.compose.ui:ui-graphics")
+ implementation("androidx.compose.ui:ui-tooling-preview")
+ implementation("androidx.compose.material3:material3")
+ implementation(project(":llama"))
+ testImplementation("junit:junit:4.13.2")
+ androidTestImplementation("androidx.test.ext:junit:1.1.5")
+ androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
+ androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
+ androidTestImplementation("androidx.compose.ui:ui-test-junit4")
+ debugImplementation("androidx.compose.ui:ui-tooling")
+ debugImplementation("androidx.compose.ui:ui-test-manifest")
+ }
package/src/llama.cpp/examples/llama.android/build.gradle.kts
@@ -0,0 +1,6 @@
+ // Top-level build file where you can add configuration options common to all sub-projects/modules.
+ plugins {
+ id("com.android.application") version "8.2.0" apply false
+ id("org.jetbrains.kotlin.android") version "1.9.0" apply false
+ id("com.android.library") version "8.2.0" apply false
+ }