@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/imatrix/imatrix.cpp

@@ -37,13 +37,13 @@ struct Stats {
 class IMatrixCollector {
 public:
     IMatrixCollector() = default;
-    void set_params(gpt_params params) { m_params = std::move(params); }
+    void set_params(common_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix(int ncall = -1) const;
     bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
-    gpt_params m_params;
+    common_params m_params;
     std::mutex m_mutex;
     int m_last_call = 0;
     std::vector<float> m_src1_data;
@@ -428,7 +428,7 @@ static void process_logits(
     }
 }

-static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
+static bool compute_imatrix(llama_context * ctx, const common_params & params) {
     const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
     GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
     const int n_ctx = llama_n_ctx(ctx);
@@ -436,7 +436,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
     auto tim1 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);

     auto tim2 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -496,6 +496,8 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
         // clear the KV cache
         llama_kv_cache_clear(ctx);

+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size  = std::min(end - batch_start, n_batch);
@@ -508,9 +510,14 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
                 tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
             }

-            // TODO: use batch.logits to save computations instead of relying on logits_all == true
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
+            if (llama_decode(ctx, batch)) {
                 LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                 return false;
             }

@@ -523,6 +530,8 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
             }
         }

+        llama_batch_free(batch);
+
         const auto t_end = std::chrono::high_resolution_clock::now();

         if (i == 0) {
@@ -568,17 +577,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 }

 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;

     params.n_ctx = 512;
     params.logits_all = true;
     params.escape = false;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
         return 1;
     }

-    gpt_init();
+    common_init();

     params.n_batch = std::min(params.n_batch, params.n_ctx);

@@ -607,7 +616,7 @@ int main(int argc, char ** argv) {
     params.warmup = false;

     // init
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);

     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -625,13 +634,22 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }

-    if (!compute_imatrix(ctx, params)) {
-        return 1;
+    if (params.prompt.empty()) {
+        if (params.in_files.empty()) {
+            LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
+            return 1;
+        }
+        LOG_INF("No prompt provided; combining precomputed matrices only.\n");
+    } else {
+        if (!compute_imatrix(ctx, params)) {
+            return 1;
+        }
     }

+
     g_collector.save_imatrix();

     LOG("\n");
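Note: the imatrix change above drops the old llama_batch_get_one call (which relied on logits_all) in favor of an explicitly constructed llama_batch. The sketch below is illustrative only and not code from the package; the function name eval_with_explicit_batch and its scaffolding are assumptions, but it uses only the helpers visible in the hunk (llama_batch_init, common_batch_clear, common_batch_add, llama_decode, llama_batch_free).

// Minimal sketch, assuming the llama.cpp common helpers shown in the hunk:
// evaluate a token sequence through an explicit llama_batch, requesting
// logits for every position instead of relying on logits_all.
#include "common.h"
#include "llama.h"

#include <algorithm>
#include <vector>

static bool eval_with_explicit_batch(llama_context * ctx, const std::vector<llama_token> & tokens, int n_batch) {
    // one reusable batch: token-only (embd = 0), single sequence
    llama_batch batch = llama_batch_init(n_batch, 0, 1);

    for (int start = 0; start < (int) tokens.size(); start += n_batch) {
        const int n_eval = std::min((int) tokens.size() - start, n_batch);

        common_batch_clear(batch);
        for (int i = 0; i < n_eval; i++) {
            // token, absolute position, sequence id 0, logits requested
            common_batch_add(batch, tokens[start + i], start + i, { 0 }, true);
        }

        if (llama_decode(ctx, batch)) {
            llama_batch_free(batch);
            return false; // decode failed
        }
    }

    llama_batch_free(batch);
    return true;
}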
package/src/llama.cpp/examples/infill/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-infill)
 add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
package/src/llama.cpp/examples/infill/infill.cpp

@@ -35,58 +35,14 @@

 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
-static gpt_sampler             ** g_smpl;
-static gpt_params               * g_params;
+static common_sampler          ** g_smpl;
+static common_params            * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;

 static bool is_interacting = false;

-static void write_logfile(
-    const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const std::vector<llama_token> & input_tokens, const std::string & output,
-    const std::vector<llama_token> & output_tokens
-) {
-    if (params.logdir.empty()) {
-        return;
-    }
-
-    const std::string timestamp = string_get_sortable_timestamp();
-
-    const bool success = fs_create_directory_with_parents(params.logdir);
-    if (!success) {
-        LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
-                __func__, params.logdir.c_str());
-        return;
-    }
-
-    const std::string logfile_path = params.logdir + timestamp + ".yml";
-    FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-        return;
-    }
-
-    fprintf(logfile, "binary: infill\n");
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
-    fprintf(logfile, "\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "# Generation Results #\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "\n");
-
-    yaml_dump_string_multiline(logfile, "output", output.c_str());
-    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
-
-    llama_perf_dump_yaml(logfile, ctx);
-    fclose(logfile);
-}
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
     if (signo == SIGINT) {
@@ -95,12 +51,11 @@ static void sigint_handler(int signo) {
         } else {
             console::cleanup();
             LOG("\n");
-            gpt_perf_print(*g_ctx, *g_smpl);
-            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+            common_perf_print(*g_ctx, *g_smpl);

             // make sure all logs are flushed
             LOG("Interrupted by user\n");
-            gpt_log_pause(gpt_log_main());
+            common_log_pause(common_log_main());

             _exit(130);
         }
@@ -109,16 +64,16 @@ static void sigint_handler(int signo) {
 #endif

 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
     g_params = &params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
         return 1;
     }

-    gpt_init();
+    common_init();

-    auto & sparams = params.sparams;
+    auto & sparams = params.sampling;

     console::init(params.simple_io, params.use_color);
     atexit([]() { console::cleanup(); });
@@ -166,7 +121,7 @@ int main(int argc, char ** argv) {

     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
-    gpt_sampler * smpl = nullptr;
+    common_sampler * smpl = nullptr;

     g_model = &model;
     g_ctx = &ctx;
@@ -174,7 +129,7 @@ int main(int argc, char ** argv) {

     // load the model and apply lora adapter, if any
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);

     model = llama_init.model;
     ctx = llama_init.context;
@@ -195,21 +150,21 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
     const bool add_bos = llama_add_bos_token(model);
     GGML_ASSERT(!llama_add_eos_token(model));

     std::vector<llama_token> embd_inp;
     std::vector<llama_token> embd_end;
-    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+    std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+    std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);

-    GGML_ASSERT(llama_token_prefix(model) >= 0);
-    GGML_ASSERT(llama_token_suffix(model) >= 0);
+    GGML_ASSERT(llama_token_fim_pre(model) >= 0);
+    GGML_ASSERT(llama_token_fim_suf(model) >= 0);

-    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+    inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+    inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

     embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
     embd_end = params.spm_infill ? inp_pfx : inp_sfx;
@@ -218,7 +173,7 @@ int main(int argc, char ** argv) {
     }
     embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

-    const llama_token middle_token = llama_token_middle(model);
+    const llama_token middle_token = llama_token_fim_mid(model);
     if (middle_token >= 0) {
         embd_inp.push_back(middle_token);
     }
@@ -257,13 +212,13 @@ int main(int argc, char ** argv) {
     LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
     LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
     for (int i = 0; i < (int) embd_inp.size(); i++) {
-        LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+        LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
     }

     if (params.n_keep > 0) {
         LOG_INF("%s: static prompt based on n_keep: '", __func__);
         for (int i = 0; i < params.n_keep; i++) {
-            LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+            LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
         }
         LOG_CNT("'\n");
     }
@@ -298,11 +253,11 @@ int main(int argc, char ** argv) {
             LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
         }
     }
-    smpl = gpt_sampler_init(model, sparams);
+    smpl = common_sampler_init(model, sparams);

-    LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
+    LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
     LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
-    LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
+    LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());

     LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);

@@ -396,7 +351,7 @@ int main(int argc, char ** argv) {

             LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());

-            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
+            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
                 LOG_ERR("%s : failed to eval\n", __func__);
                 return 1;
             }
@@ -411,9 +366,9 @@ int main(int argc, char ** argv) {
         embd.clear();

         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
+            const llama_token id = common_sampler_sample(smpl, ctx, -1);

-            gpt_sampler_accept(smpl, id, true);
+            common_sampler_accept(smpl, id, true);

             // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());

@@ -434,7 +389,7 @@ int main(int argc, char ** argv) {

                 // push the prompt in the sampling context in order to apply repetition penalties later
                 // for the prompt, we don't apply grammar rules
-                gpt_sampler_accept(smpl, embd_inp[n_consumed], false);
+                common_sampler_accept(smpl, embd_inp[n_consumed], false);

                 ++n_consumed;
                 if ((int) embd.size() >= params.n_batch) {
@@ -446,7 +401,7 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo) {
             for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id);
+                const std::string token_str = common_token_to_piece(ctx, id);
                 LOG("%s", token_str.c_str());

                 if (embd.size() > 1) {
@@ -465,10 +420,10 @@ int main(int argc, char ** argv) {
         // if not currently processing queued inputs;
         if ((int) embd_inp.size() <= n_consumed) {
             // deal with eot token in infill mode
-            if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
+            if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
                 if (is_interacting && !params.interactive_first) {
                     // print an eot token
-                    LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+                    LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
                 }
                 LOG("\n");
                 console::set_display(console::user_input);
@@ -505,11 +460,11 @@ int main(int argc, char ** argv) {
                 }

                 // tokenize new prefix and suffix
-                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+                std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+                std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);

-                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+                inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+                inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

                 embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
                 embd_end = params.spm_infill ? inp_pfx : inp_sfx;
@@ -529,7 +484,7 @@ int main(int argc, char ** argv) {
                 is_interacting = false;
             }
             // deal with end of generation tokens in interactive mode
-            else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
+            else if (llama_token_is_eog(model, common_sampler_last(smpl))) {
                 LOG_DBG("found EOS token\n");

                 if (params.interactive) {
@@ -579,7 +534,7 @@ int main(int argc, char ** argv) {

                 const size_t original_size = embd_inp.size();

-                const auto line_inp = ::llama_tokenize(ctx, buffer, false);
+                const auto line_inp = common_tokenize(ctx, buffer, false);
                 LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());

                 embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
@@ -587,7 +542,7 @@ int main(int argc, char ** argv) {
                 for (size_t i = original_size; i < embd_inp.size(); ++i) {
                     const llama_token token = embd_inp[i];
                     output_tokens.push_back(token);
-                    output_ss << llama_token_to_piece(ctx, token);
+                    output_ss << common_token_to_piece(ctx, token);
                 }

                 n_remain -= line_inp.size();
@@ -601,7 +556,7 @@ int main(int argc, char ** argv) {

             if (n_past > 0) {
                 if (is_interacting) {
-                    gpt_sampler_reset(smpl);
+                    common_sampler_reset(smpl);
                 }
                 is_interacting = false;
             }
@@ -620,17 +575,16 @@ int main(int argc, char ** argv) {
         }
     }
     if (!params.interactive && n_remain <= 0) {
-        LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+        LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
     }

     LOG("\n");
-    gpt_perf_print(ctx, smpl);
-    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+    common_perf_print(ctx, smpl);

     llama_free(ctx);
     llama_free_model(model);

-    gpt_sampler_free(smpl);
+    common_sampler_free(smpl);
     llama_backend_free();

     return 0;
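The infill changes above are largely mechanical renames: gpt_* helpers become common_* (common_params_parse, common_init, common_sampler_*, common_token_to_piece, common_tokenize), and the fill-in-the-middle token accessors move from llama_token_prefix/suffix/middle to llama_token_fim_pre/fim_suf/fim_mid. Below is a hedged sketch of how the renamed pieces compose when assembling an infill prompt; build_infill_prompt and its scaffolding are illustrative assumptions, not code from the package.

// Illustrative only: build a fill-in-the-middle prompt with the renamed
// helpers that infill.cpp uses after this change (common_tokenize,
// llama_token_fim_pre/suf/mid). Error handling is intentionally minimal.
#include "common.h"
#include "llama.h"

#include <string>
#include <vector>

static std::vector<llama_token> build_infill_prompt(llama_context * ctx, const llama_model * model,
                                                    const std::string & prefix, const std::string & suffix) {
    std::vector<llama_token> inp_pfx = common_tokenize(ctx, prefix, false);
    std::vector<llama_token> inp_sfx = common_tokenize(ctx, suffix, false);

    // prepend the FIM special tokens (formerly llama_token_prefix / llama_token_suffix)
    inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
    inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

    // prefix tokens first, then suffix tokens (infill.cpp also supports the
    // reversed SPM order via params.spm_infill)
    std::vector<llama_token> embd_inp = inp_pfx;
    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());

    // generation continues after the FIM "middle" token, if the model defines one
    const llama_token middle_token = llama_token_fim_mid(model);
    if (middle_token >= 0) {
        embd_inp.push_back(middle_token);
    }
    return embd_inp;
}

Sampling then goes through the common_sampler API shown in the hunks (common_sampler_init, common_sampler_sample, common_sampler_accept, common_sampler_free); the removed write_logfile path and its YAML dump helpers are dropped outright rather than renamed.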
package/src/llama.cpp/examples/llama-bench/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-bench)
 add_executable(${TARGET} llama-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)