@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
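Most of the vendored llama.cpp changes in the hunks below are a rename of the common library's entry points: the gpt_ prefix becomes common_, and params.sparams becomes params.sampling. The following is a minimal, non-authoritative sketch of the new call flow pieced together from those hunks; the header names and the use of params.prompt are assumptions about the vendored revision.

```cpp
// Hedged sketch of the renamed common-library setup (not code from this package).
#include <vector>

#include "arg.h"       // assumed location of common_params_parse
#include "common.h"
#include "sampling.h"
#include "llama.h"

int main(int argc, char ** argv) {
    common_params params;                                         // was: gpt_params
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN)) {
        return 1;
    }
    common_init();                                                // was: gpt_init

    llama_backend_init();

    common_init_result llama_init = common_init_from_params(params);  // was: llama_init_from_gpt_params
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;

    common_sampler * smpl = common_sampler_init(model, params.sampling);  // was: gpt_sampler_init(..., params.sparams)

    std::vector<llama_token> embd_inp = common_tokenize(ctx, params.prompt, true, true);  // was: ::llama_tokenize

    // ... decode and sample as in examples/main/main.cpp below ...

    common_sampler_free(smpl);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```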
@@ -33,8 +33,8 @@
 
  static llama_context ** g_ctx;
  static llama_model ** g_model;
- static gpt_sampler ** g_smpl;
- static gpt_params * g_params;
+ static common_sampler ** g_smpl;
+ static common_params * g_params;
  static std::vector<llama_token> * g_input_tokens;
  static std::ostringstream * g_output_ss;
  static std::vector<llama_token> * g_output_tokens;
@@ -62,49 +62,6 @@ static bool file_is_empty(const std::string & path) {
  return f.tellg() == 0;
  }
 
- static void write_logfile(
- const llama_context * ctx, const gpt_params & params, const llama_model * model,
- const std::vector<llama_token> & input_tokens, const std::string & output,
- const std::vector<llama_token> & output_tokens
- ) {
- if (params.logdir.empty()) {
- return;
- }
-
- const std::string timestamp = string_get_sortable_timestamp();
-
- const bool success = fs_create_directory_with_parents(params.logdir);
- if (!success) {
- LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
- return;
- }
-
- const std::string logfile_path = params.logdir + timestamp + ".yml";
- FILE * logfile = fopen(logfile_path.c_str(), "w");
-
- if (logfile == NULL) {
- LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
- return;
- }
-
- fprintf(logfile, "binary: main\n");
- char model_desc[128];
- llama_model_desc(model, model_desc, sizeof(model_desc));
- yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
- fprintf(logfile, "\n");
- fprintf(logfile, "######################\n");
- fprintf(logfile, "# Generation Results #\n");
- fprintf(logfile, "######################\n");
- fprintf(logfile, "\n");
-
- yaml_dump_string_multiline(logfile, "output", output.c_str());
- yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
-
- llama_perf_dump_yaml(logfile, ctx);
- fclose(logfile);
- }
-
  #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
  static void sigint_handler(int signo) {
  if (signo == SIGINT) {
@@ -114,12 +71,11 @@ static void sigint_handler(int signo) {
  } else {
  console::cleanup();
  LOG("\n");
- gpt_perf_print(*g_ctx, *g_smpl);
- write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+ common_perf_print(*g_ctx, *g_smpl);
 
  // make sure all logs are flushed
  LOG("Interrupted by user\n");
- gpt_log_pause(gpt_log_main());
+ common_log_pause(common_log_main());
 
  _exit(130);
  }
@@ -127,24 +83,24 @@
  }
  #endif
 
- static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
- llama_chat_msg new_msg{role, content};
- auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+ static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
+ common_chat_msg new_msg{role, content};
+ auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
  chat_msgs.push_back({role, content});
  LOG_DBG("formatted: '%s'\n", formatted.c_str());
  return formatted;
  }
 
  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;
  g_params = &params;
- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
  return 1;
  }
 
- gpt_init();
+ common_init();
 
- auto & sparams = params.sparams;
+ auto & sparams = params.sampling;
 
  // save choice to use color for later
  // (note for later: this is a slightly awkward choice)
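The hunk above also renames the chat-template helpers (llama_chat_msg becomes common_chat_msg, llama_chat_format_single becomes common_chat_format_single). A small sketch of the same pattern follows; treating the last argument as "append assistant prefix" is an assumption, since the hunk only passes role == "user".

```cpp
// Hedged sketch of the renamed chat-template helpers; wrapper name is illustrative.
#include <string>
#include <vector>

#include "common.h"

static std::string add_user_turn(llama_model * model,
                                 const std::string & chat_template,
                                 std::vector<common_chat_msg> & chat_msgs,
                                 const std::string & content) {
    common_chat_msg new_msg{"user", content};
    // format only the newly added message against the accumulated history
    auto formatted = common_chat_format_single(model, chat_template, chat_msgs, new_msg, /* assumed: add assistant prefix */ true);
    chat_msgs.push_back({"user", content});
    return formatted;
}
```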
@@ -187,9 +143,9 @@
 
  llama_model * model = nullptr;
  llama_context * ctx = nullptr;
- gpt_sampler * smpl = nullptr;
+ common_sampler * smpl = nullptr;
 
- std::vector<llama_chat_msg> chat_msgs;
+ std::vector<common_chat_msg> chat_msgs;
 
  g_model = &model;
  g_ctx = &ctx;
@@ -197,7 +153,7 @@
 
  // load the model and apply lora adapter, if any
  LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);
 
  model = llama_init.model;
  ctx = llama_init.context;
@@ -209,6 +165,10 @@
 
  LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
 
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+ auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
+ auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
+
  struct ggml_threadpool_params tpp_batch =
  ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
  struct ggml_threadpool_params tpp =
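The added lines above resolve the CPU threadpool constructor and destructor through the ggml backend registry instead of linking to them directly. A hedged sketch of that lookup; the header names are assumptions based on the new ggml-cpu/ggml-backend split elsewhere in this diff, while the registry calls themselves appear verbatim in the hunk.

```cpp
// Hedged sketch: resolving the CPU threadpool factory via the backend registry.
#include "ggml-backend.h"
#include "ggml-cpu.h"   // assumed to declare ggml_threadpool_new / ggml_threadpool_free

static struct ggml_threadpool * create_cpu_threadpool(struct ggml_threadpool_params * tpp) {
    auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    auto * reg = ggml_backend_dev_backend_reg(dev);

    // fetch the function pointer by name, exactly as the hunk does
    auto * ggml_threadpool_new_fn =
        (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");

    // the matching "ggml_threadpool_free" proc is resolved the same way for teardown
    return ggml_threadpool_new_fn(tpp);
}
```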
@@ -218,7 +178,7 @@
 
  struct ggml_threadpool * threadpool_batch = NULL;
  if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
- threadpool_batch = ggml_threadpool_new(&tpp_batch);
+ threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
  if (!threadpool_batch) {
  LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
  return 1;
@@ -228,7 +188,7 @@
  tpp.paused = true;
  }
 
- struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+ struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
  if (!threadpool) {
  LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
  return 1;
@@ -246,7 +206,7 @@
  // print chat template example in conversation mode
  if (params.conversation) {
  if (params.enable_chat_template) {
- LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+ LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
  } else {
  LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
  }
@@ -255,7 +215,7 @@
  // print system information
  {
  LOG_INF("\n");
- LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("%s\n", common_params_get_system_info(params).c_str());
  LOG_INF("\n");
  }
 
@@ -296,7 +256,7 @@
  : params.prompt;
  if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
  LOG_DBG("tokenize the prompt\n");
- embd_inp = ::llama_tokenize(ctx, prompt, true, true);
+ embd_inp = common_tokenize(ctx, prompt, true, true);
  } else {
  LOG_DBG("use session tokens\n");
  embd_inp = session_tokens;
@@ -379,13 +339,13 @@
  LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
  LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
  for (int i = 0; i < (int) embd_inp.size(); i++) {
- LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
  }
 
  if (params.n_keep > add_bos) {
  LOG_INF("%s: static prompt based on n_keep: '", __func__);
  for (int i = 0; i < params.n_keep; i++) {
- LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
  }
  LOG_CNT("'\n");
  }
@@ -415,9 +375,9 @@
  for (const auto & antiprompt : params.antiprompt) {
  LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
  if (params.verbose_prompt) {
- auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+ auto tmp = common_tokenize(ctx, antiprompt, false, true);
  for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
  }
  }
  }
@@ -430,9 +390,9 @@
  if (!params.input_prefix.empty()) {
  LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
  if (params.verbose_prompt) {
- auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+ auto tmp = common_tokenize(ctx, params.input_prefix, true, true);
  for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
  }
  }
  }
@@ -440,23 +400,23 @@
  if (!params.input_suffix.empty()) {
  LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
  if (params.verbose_prompt) {
- auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+ auto tmp = common_tokenize(ctx, params.input_suffix, false, true);
  for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
  }
  }
  }
  }
 
- smpl = gpt_sampler_init(model, sparams);
+ smpl = common_sampler_init(model, sparams);
  if (!smpl) {
  LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
  return 1;
  }
 
- LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
+ LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
  LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
- LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
+ LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
 
  LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 
@@ -521,14 +481,14 @@
 
  antiprompt_ids.reserve(params.antiprompt.size());
  for (const std::string & antiprompt : params.antiprompt) {
- antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
+ antiprompt_ids.emplace_back(::common_tokenize(ctx, antiprompt, false, true));
  }
 
  if (llama_model_has_encoder(model)) {
  int enc_input_size = embd_inp.size();
  llama_token * enc_input_buf = embd_inp.data();
 
- if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
+ if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) {
  LOG_ERR("%s : failed to eval\n", __func__);
  return 1;
  }
@@ -569,30 +529,30 @@
  if (!params.ctx_shift){
  LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
  break;
- } else {
- if (params.n_predict == -2) {
- LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
- break;
- }
+ }
 
- const int n_left = n_past - params.n_keep;
- const int n_discard = n_left/2;
+ if (params.n_predict == -2) {
+ LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+ break;
+ }
 
- LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
- n_past, n_left, n_ctx, params.n_keep, n_discard);
+ const int n_left = n_past - params.n_keep;
+ const int n_discard = n_left/2;
 
- llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+ LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+ n_past, n_left, n_ctx, params.n_keep, n_discard);
 
- n_past -= n_discard;
+ llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
+ llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
 
- LOG_DBG("after swap: n_past = %d\n", n_past);
+ n_past -= n_discard;
 
- LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
+ LOG_DBG("after swap: n_past = %d\n", n_past);
 
- LOG_DBG("clear session path\n");
- path_session.clear();
- }
+ LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
+
+ LOG_DBG("clear session path\n");
+ path_session.clear();
  }
  } else {
  // context extension via Self-Extend
@@ -648,7 +608,7 @@
 
  LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
- if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
+ if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
  LOG_ERR("%s : failed to eval\n", __func__);
  return 1;
  }
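Both the llama_encode and llama_decode calls above switch to the two-argument llama_batch_get_one(), dropping the explicit position and sequence-id arguments so positions are tracked by the context across decode calls. A minimal sketch of the new call; the helper name and boolean return are illustrative only.

```cpp
// Hedged sketch of the simplified batch helper seen in the hunks above.
#include "llama.h"

static bool eval_span(llama_context * ctx, llama_token * tokens, int32_t n_tokens) {
    // before: llama_batch_get_one(tokens, n_tokens, n_past, 0)
    // after:  only the token span is passed
    return llama_decode(ctx, llama_batch_get_one(tokens, n_tokens)) == 0;
}
```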
@@ -679,9 +639,9 @@
  LOG_DBG("saved session to %s\n", path_session.c_str());
  }
 
- const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
+ const llama_token id = common_sampler_sample(smpl, ctx, -1);
 
- gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
+ common_sampler_accept(smpl, id, /* accept_grammar= */ true);
 
  // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
 
@@ -702,7 +662,7 @@
 
  // push the prompt in the sampling context in order to apply repetition penalties later
  // for the prompt, we don't apply grammar rules
- gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
+ common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
 
  ++n_consumed;
  if ((int) embd.size() >= params.n_batch) {
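The sampling calls in the hunks above follow the same rename (gpt_sampler_* becomes common_sampler_*): sample from the last row of logits, then feed the chosen token back so penalty and grammar state stay in sync. A small sketch of that pair; the wrapper function is hypothetical.

```cpp
// Hedged sketch of the renamed sample/accept pair.
#include "sampling.h"

static llama_token sample_next(common_sampler * smpl, llama_context * ctx) {
    const llama_token id = common_sampler_sample(smpl, ctx, /* idx = */ -1);  // -1: last logits row
    common_sampler_accept(smpl, id, /* accept_grammar = */ true);
    return id;
}
```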
@@ -714,7 +674,7 @@
  // display text
  if (input_echo && display) {
  for (auto id : embd) {
- const std::string token_str = llama_token_to_piece(ctx, id, params.special);
+ const std::string token_str = common_token_to_piece(ctx, id, params.special);
 
  // Console/Stream Output
  LOG("%s", token_str.c_str());
@@ -743,7 +703,7 @@
  // check for reverse prompt in the last n_prev tokens
  if (!params.antiprompt.empty()) {
  const int n_prev = 32;
- const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev);
+ const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev);
 
  is_antiprompt = false;
  // Check if each of the reverse prompts appears at the end of the output.
@@ -765,7 +725,7 @@
  }
 
  // check for reverse prompt using special tokens
- llama_token last_token = gpt_sampler_last(smpl);
+ llama_token last_token = common_sampler_last(smpl);
  for (std::vector<llama_token> ids : antiprompt_ids) {
  if (ids.size() == 1 && last_token == ids[0]) {
  if (params.interactive) {
@@ -782,13 +742,13 @@
  }
 
  // deal with end of generation tokens in interactive mode
- if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
+ if (llama_token_is_eog(model, common_sampler_last(smpl))) {
  LOG_DBG("found an EOG token\n");
 
  if (params.interactive) {
  if (!params.antiprompt.empty()) {
  // tokenize and inject first reverse prompt
- const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
+ const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true);
  embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
  is_antiprompt = true;
  }
@@ -803,8 +763,8 @@
 
  // if current token is not EOG, we add it to current assistant message
  if (params.conversation) {
- const auto id = gpt_sampler_last(smpl);
- assistant_ss << llama_token_to_piece(ctx, id, false);
+ const auto id = common_sampler_last(smpl);
+ assistant_ss << common_token_to_piece(ctx, id, false);
  }
 
  if (n_past > 0 && is_interacting) {
@@ -862,9 +822,9 @@
  ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
  : std::move(buffer);
  // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
- const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
- const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
- const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+ const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
+ const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat);
+ const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true);
 
  LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
 
@@ -882,7 +842,7 @@
  for (size_t i = original_size; i < embd_inp.size(); ++i) {
  const llama_token token = embd_inp[i];
  output_tokens.push_back(token);
- output_ss << llama_token_to_piece(ctx, token);
+ output_ss << common_token_to_piece(ctx, token);
  }
 
  // reset assistant message
@@ -899,7 +859,7 @@
 
  if (n_past > 0) {
  if (is_interacting) {
- gpt_sampler_reset(smpl);
+ common_sampler_reset(smpl);
  }
  is_interacting = false;
  }
@@ -925,18 +885,17 @@
  }
 
  LOG("\n\n");
- gpt_perf_print(ctx, smpl);
- write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+ common_perf_print(ctx, smpl);
 
- gpt_sampler_free(smpl);
+ common_sampler_free(smpl);
 
  llama_free(ctx);
  llama_free_model(model);
 
  llama_backend_free();
 
- ggml_threadpool_free(threadpool);
- ggml_threadpool_free(threadpool_batch);
+ ggml_threadpool_free_fn(threadpool);
+ ggml_threadpool_free_fn(threadpool_batch);
 
  return 0;
  }
@@ -29,4 +29,4 @@ add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
  target_include_directories(${TARGET} PRIVATE ${_common_path})
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -2,4 +2,4 @@ set(TARGET llama-parallel)
  add_executable(${TARGET} parallel.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
@@ -54,7 +54,7 @@ static std::vector<std::string> k_prompts = {
  struct client {
  ~client() {
  if (smpl) {
- gpt_sampler_free(smpl);
+ common_sampler_free(smpl);
  }
  }
 
@@ -75,7 +75,7 @@
  std::string prompt;
  std::string response;
 
- struct gpt_sampler * smpl = nullptr;
+ struct common_sampler * smpl = nullptr;
  };
 
  static void print_date_time() {
@@ -103,13 +103,13 @@ static std::vector<std::string> split_string(const std::string& input, char deli
  int main(int argc, char ** argv) {
  srand(1234);
 
- gpt_params params;
+ common_params params;
 
- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
  return 1;
  }
 
- gpt_init();
+ common_init();
 
  // number of simultaneous "clients" to simulate
  const int32_t n_clients = params.n_parallel;
@@ -130,7 +130,7 @@
  llama_numa_init(params.numa);
 
  // load the target model
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);
 
  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;
@@ -160,11 +160,11 @@
  for (size_t i = 0; i < clients.size(); ++i) {
  auto & client = clients[i];
  client.id = i;
- client.smpl = gpt_sampler_init(model, params.sparams);
+ client.smpl = common_sampler_init(model, params.sampling);
  }
 
  std::vector<llama_token> tokens_system;
- tokens_system = ::llama_tokenize(ctx, k_system, true);
+ tokens_system = common_tokenize(ctx, k_system, true);
  const int32_t n_tokens_system = tokens_system.size();
 
  llama_seq_id g_seq_id = 0;
@@ -189,7 +189,7 @@
  LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
 
  for (int32_t i = 0; i < n_tokens_system; ++i) {
- llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
+ common_batch_add(batch, tokens_system[i], i, { 0 }, false);
  }
 
  if (llama_decode(ctx, batch) != 0) {
@@ -210,10 +210,10 @@
  while (true) {
  if (dump_kv_cache) {
  llama_kv_cache_view_update(ctx, &kvc_view);
- llama_kv_cache_dump_view_seqs(kvc_view, 40);
+ common_kv_cache_dump_view_seqs(kvc_view, 40);
  }
 
- llama_batch_clear(batch);
+ common_batch_clear(batch);
 
  // decode any currently ongoing sequences
  for (auto & client : clients) {
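The parallel example above switches to the renamed batch helpers (llama_batch_clear/llama_batch_add become common_batch_clear/common_batch_add). A hedged sketch of how one sequence's tokens are queued with them; the wrapper function and its parameters are illustrative.

```cpp
// Hedged sketch of the renamed batch helpers used in the hunk above.
#include <vector>

#include "common.h"

static void rebuild_batch(llama_batch & batch,
                          const std::vector<llama_token> & tokens,
                          llama_pos pos0, llama_seq_id seq_id) {
    common_batch_clear(batch);
    for (size_t i = 0; i < tokens.size(); ++i) {
        // arguments: token, position, sequence ids, whether logits are needed for this token
        common_batch_add(batch, tokens[i], pos0 + (llama_pos) i, { seq_id }, i == tokens.size() - 1);
    }
}
```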
@@ -223,7 +223,7 @@
 
  client.i_batch = batch.n_tokens;
 
- llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
+ common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
 
  client.n_decoded += 1;
  }
@@ -252,14 +252,14 @@
  client.prompt = client.input + "\nAssistant:";
  client.response = "";
 
- gpt_sampler_reset(client.smpl);
+ common_sampler_reset(client.smpl);
 
  // do not prepend BOS because we have a system prompt!
  std::vector<llama_token> tokens_prompt;
- tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
+ tokens_prompt = common_tokenize(ctx, client.prompt, false);
 
  for (size_t i = 0; i < tokens_prompt.size(); ++i) {
- llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
+ common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
  }
 
  // extract the logits only for the last token
@@ -308,7 +308,6 @@
  batch.n_seq_id + i,
  batch.seq_id + i,
  batch.logits + i,
- 0, 0, 0, // unused
  };
 
  const int ret = llama_decode(ctx, batch_view);
@@ -340,9 +339,9 @@
  //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
  // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
 
- const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i);
+ const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i);
 
- gpt_sampler_accept(client.smpl, id, true);
+ common_sampler_accept(client.smpl, id, true);
 
  if (client.n_decoded == 1) {
  // start measuring generation time after the first token to make sure all concurrent clients
@@ -350,7 +349,7 @@
  client.t_start_gen = ggml_time_us();
  }
 
- const std::string token_str = llama_token_to_piece(ctx, id);
+ const std::string token_str = common_token_to_piece(ctx, id);
 
  client.response += token_str;
  client.sampled = id;
@@ -2,4 +2,4 @@ set(TARGET llama-passkey)
  add_executable(${TARGET} passkey.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)