@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/simple/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-simple)
 add_executable(${TARGET} simple.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
package/src/llama.cpp/examples/simple/simple.cpp
@@ -1,50 +1,116 @@
-#include "arg.h"
-#include "common.h"
-#include "log.h"
 #include "llama.h"
-
+#include <cstdio>
+#include <cstring>
+#include <string>
 #include <vector>
 
 static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
-    LOG("\n");
+    printf("\nexample usage:\n");
+    printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]);
+    printf("\n");
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
-
-    params.prompt = "Hello my name is";
-    params.n_predict = 32;
-
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
-        return 1;
+    // path to the model gguf file
+    std::string model_path;
+    // prompt to generate text from
+    std::string prompt = "Hello my name is";
+    // number of layers to offload to the GPU
+    int ngl = 99;
+    // number of tokens to predict
+    int n_predict = 32;
+
+    // parse command line arguments
+
+    {
+        int i = 1;
+        for (; i < argc; i++) {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-n") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        n_predict = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        ngl = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                // prompt starts here
+                break;
+            }
+        }
+        if (model_path.empty()) {
+            print_usage(argc, argv);
+            return 1;
+        }
+        if (i < argc) {
+            prompt = argv[i++];
+            for (; i < argc; i++) {
+                prompt += " ";
+                prompt += argv[i];
+            }
+        }
     }
 
-    gpt_init();
-
-    // total length of the sequence including the prompt
-    const int n_predict = params.n_predict;
+    // load dynamic backends
 
-    // init LLM
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
+    ggml_backend_load_all();
 
     // initialize the model
 
-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
 
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
+    // tokenize the prompt
+
+    // find the number of tokens in the prompt
+    const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+
+    // allocate space for the tokens and tokenize the prompt
+    std::vector<llama_token> prompt_tokens(n_prompt);
+    if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
+        return 1;
+    }
+
     // initialize the context
 
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    llama_context_params ctx_params = llama_context_default_params();
+    // n_ctx is the context size
+    ctx_params.n_ctx = n_prompt + n_predict - 1;
+    // n_batch is the maximum number of tokens that can be processed in a single call to llama_decode
+    ctx_params.n_batch = n_prompt;
+    // enable performance counters
+    ctx_params.no_perf = false;
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
@@ -53,117 +119,87 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    auto sparams = llama_sampler_chain_default_params();
+    // initialize the sampler
 
+    auto sparams = llama_sampler_chain_default_params();
     sparams.no_perf = false;
-
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
     llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
 
-    // tokenize the prompt
-
-    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
-
-    const int n_ctx = llama_n_ctx(ctx);
-    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
-
-    LOG("\n");
-    LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
-
-    // make sure the KV cache is big enough to hold all the prompt and generated tokens
-    if (n_kv_req > n_ctx) {
-        LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
-        return 1;
-    }
-
     // print the prompt token-by-token
 
-    LOG("\n");
-
-    for (auto id : tokens_list) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
-    }
-
-    // create a llama_batch with size 512
-    // we use this object to submit token data for decoding
-
-    llama_batch batch = llama_batch_init(512, 0, 1);
-
-    // evaluate the initial prompt
-    for (size_t i = 0; i < tokens_list.size(); i++) {
-        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
+    for (auto id : prompt_tokens) {
+        char buf[128];
+        int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
+        if (n < 0) {
+            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+            return 1;
+        }
+        std::string s(buf, n);
+        printf("%s", s.c_str());
     }
 
-    // llama_decode will output logits only for the last token of the prompt
-    batch.logits[batch.n_tokens - 1] = true;
+    // prepare a batch for the prompt
 
-    if (llama_decode(ctx, batch) != 0) {
-        LOG("%s: llama_decode() failed\n", __func__);
-        return 1;
-    }
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
 
     // main loop
 
-    int n_cur = batch.n_tokens;
+    const auto t_main_start = ggml_time_us();
     int n_decode = 0;
+    llama_token new_token_id;
 
-    const auto t_main_start = ggml_time_us();
+    for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
+        // evaluate the current batch with the transformer model
+        if (llama_decode(ctx, batch)) {
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            return 1;
+        }
+
+        n_pos += batch.n_tokens;
 
-    while (n_cur <= n_predict) {
         // sample the next token
         {
-            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);
 
             // is it an end of generation?
-            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
-                LOG("\n");
-
+            if (llama_token_is_eog(model, new_token_id)) {
                 break;
             }
 
-            LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+            char buf[128];
+            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+                return 1;
+            }
+            std::string s(buf, n);
+            printf("%s", s.c_str());
            fflush(stdout);
 
-            // prepare the next batch
-            llama_batch_clear(batch);
-
-            // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);
 
            n_decode += 1;
        }
-
-        n_cur += 1;
-
-        // evaluate the current batch with the transformer model
-        if (llama_decode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
-            return 1;
-        }
    }
 
-    LOG("\n");
+    printf("\n");
 
     const auto t_main_end = ggml_time_us();
 
-    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    LOG("\n");
+    fprintf(stderr, "\n");
    llama_perf_sampler_print(smpl);
    llama_perf_context_print(ctx);
+    fprintf(stderr, "\n");
 
-    LOG("\n");
-
-    llama_batch_free(batch);
    llama_sampler_free(smpl);
    llama_free(ctx);
    llama_free_model(model);
 
-    llama_backend_free();
-
    return 0;
 }
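
Note: the rewritten simple.cpp above drops the common helpers and calls llama_tokenize directly, using a two-pass idiom: a first call with a NULL output buffer returns the negated token count, and a second call fills a buffer of exactly that size. A minimal standalone sketch of that pattern, based only on the calls shown in this diff (the helper name tokenize_prompt is illustrative, not part of the package):

#include "llama.h"

#include <stdexcept>
#include <string>
#include <vector>

// Illustrative helper: the two-pass llama_tokenize idiom used in the new simple.cpp.
static std::vector<llama_token> tokenize_prompt(const llama_model * model, const std::string & prompt) {
    // first pass: a NULL buffer makes llama_tokenize return the negated number of tokens needed
    const int n_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0,
                                         /*add_special*/ true, /*parse_special*/ true);
    // second pass: fill a buffer of exactly that size; a negative return signals failure
    std::vector<llama_token> tokens(n_tokens);
    if (llama_tokenize(model, prompt.c_str(), prompt.size(), tokens.data(), tokens.size(), true, true) < 0) {
        throw std::runtime_error("failed to tokenize the prompt");
    }
    return tokens;
}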
package/src/llama.cpp/examples/simple-chat/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-simple-chat)
+add_executable(${TARGET} simple-chat.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
package/src/llama.cpp/examples/simple-chat/simple-chat.cpp
@@ -0,0 +1,200 @@
+#include "llama.h"
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    std::string model_path;
+    int ngl = 99;
+    int n_ctx = 2048;
+
+    // parse command line arguments
+    for (int i = 1; i < argc; i++) {
+        try {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-c") == 0) {
+                if (i + 1 < argc) {
+                    n_ctx = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    ngl = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                print_usage(argc, argv);
+                return 1;
+            }
+        } catch (std::exception & e) {
+            fprintf(stderr, "error: %s\n", e.what());
+            print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (model_path.empty()) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    // only print errors
+    llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
+        if (level >= GGML_LOG_LEVEL_ERROR) {
+            fprintf(stderr, "%s", text);
+        }
+    }, nullptr);
+
+    // load dynamic backends
+    ggml_backend_load_all();
+
+    // initialize the model
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
+
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+    if (!model) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    // initialize the context
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = n_ctx;
+    ctx_params.n_batch = n_ctx;
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    if (!ctx) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    // initialize the sampler
+    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+    // helper function to evaluate a prompt and generate a response
+    auto generate = [&](const std::string & prompt) {
+        std::string response;
+
+        // tokenize the prompt
+        const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+        std::vector<llama_token> prompt_tokens(n_prompt_tokens);
+        if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
+            GGML_ABORT("failed to tokenize the prompt\n");
+        }
+
+        // prepare a batch for the prompt
+        llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+        llama_token new_token_id;
+        while (true) {
+            // check if we have enough space in the context to evaluate this batch
+            int n_ctx = llama_n_ctx(ctx);
+            int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
+            if (n_ctx_used + batch.n_tokens > n_ctx) {
+                printf("\033[0m\n");
+                fprintf(stderr, "context size exceeded\n");
+                exit(0);
+            }
+
+            if (llama_decode(ctx, batch)) {
+                GGML_ABORT("failed to decode\n");
+            }
+
+            // sample the next token
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);
+
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id)) {
+                break;
+            }
+
+            // convert the token to a string, print it and add it to the response
+            char buf[256];
+            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                GGML_ABORT("failed to convert token to piece\n");
+            }
+            std::string piece(buf, n);
+            printf("%s", piece.c_str());
+            fflush(stdout);
+            response += piece;
+
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);
+        }
+
+        return response;
+    };
+
+    std::vector<llama_chat_message> messages;
+    std::vector<char> formatted(llama_n_ctx(ctx));
+    int prev_len = 0;
+    while (true) {
+        // get user input
+        printf("\033[32m> \033[0m");
+        std::string user;
+        std::getline(std::cin, user);
+
+        if (user.empty()) {
+            break;
+        }
+
+        // add the user input to the message list and format it
+        messages.push_back({"user", strdup(user.c_str())});
+        int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        if (new_len > (int)formatted.size()) {
+            formatted.resize(new_len);
+            new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        }
+        if (new_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+
+        // remove previous messages to obtain the prompt to generate the response
+        std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
+
+        // generate a response
+        printf("\033[33m");
+        std::string response = generate(prompt);
+        printf("\n\033[0m");
+
+        // add the response to the messages
+        messages.push_back({"assistant", strdup(response.c_str())});
+        prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0);
+        if (prev_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+    }
+
+    // free resources
+    for (auto & msg : messages) {
+        free(const_cast<char *>(msg.content));
+    }
+    llama_sampler_free(smpl);
+    llama_free(ctx);
+    llama_free_model(model);
+
+    return 0;
+}
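
Note: the chat loop in simple-chat.cpp above relies on llama_chat_apply_template returning the required output length, so the formatting buffer can be grown and the call retried. A minimal sketch of that resize-and-retry idiom in isolation, using only the calls shown in this diff (the helper name format_chat is illustrative; passing nullptr as the template selects the template stored in the model, as in the example):

#include "llama.h"

#include <stdexcept>
#include <string>
#include <vector>

// Illustrative helper: the resize-and-retry templating used in the new simple-chat.cpp.
static std::string format_chat(const llama_model * model, const std::vector<llama_chat_message> & messages) {
    std::vector<char> buf(1024);
    // add_ass = true appends the assistant prefix so the model continues as the assistant
    int32_t len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(),
                                            /*add_ass*/ true, buf.data(), buf.size());
    if (len > (int32_t) buf.size()) {
        // the first call reported the required size; grow the buffer and format again
        buf.resize(len);
        len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(),
                                        true, buf.data(), buf.size());
    }
    if (len < 0) {
        throw std::runtime_error("failed to apply the chat template");
    }
    return std::string(buf.data(), len);
}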
package/src/llama.cpp/examples/speculative/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-speculative)
 add_executable(${TARGET} speculative.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)