@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/lookahead/lookahead.cpp

@@ -37,13 +37,13 @@ struct ngram_container {
  };
 
  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;
 
- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
  return 1;
  }
 
- gpt_init();
+ common_init();
 
  const int W = 15; // lookahead window
  const int N = 5; // n-gram size
@@ -56,7 +56,7 @@ int main(int argc, char ** argv) {
  llama_numa_init(params.numa);
 
  // load the target model
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);
 
  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;
@@ -65,7 +65,7 @@ int main(int argc, char ** argv) {
  std::vector<llama_token> inp;
  std::vector<llama_token> all;
 
- inp = ::llama_tokenize(ctx, params.prompt, true, true);
+ inp = common_tokenize(ctx, params.prompt, true, true);
  all = inp;
 
  const int max_context_size = llama_n_ctx(ctx);
@@ -79,7 +79,7 @@ int main(int argc, char ** argv) {
  LOG("\n\n");
 
  for (auto id : inp) {
- LOG("%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", common_token_to_piece(ctx, id).c_str());
  }
 
  fflush(stderr);
@@ -89,8 +89,8 @@ int main(int argc, char ** argv) {
  const auto t_enc_start = ggml_time_us();
 
  // eval the prompt
- llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
- llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
+ llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+ llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
 
  for (int s = 1; s < W + G + 1; ++s) {
  llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
@@ -115,7 +115,7 @@ int main(int argc, char ** argv) {
  llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
 
  // target model sampling context
- struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
+ struct common_sampler * smpl = common_sampler_init(model, params.sampling);
 
  // verification n-grams
  std::vector<ngram_data> ngrams_cur(G);
@@ -156,12 +156,12 @@ int main(int argc, char ** argv) {
 
  // sample first token
  {
- id = gpt_sampler_sample(smpl, ctx, 0);
+ id = common_sampler_sample(smpl, ctx, 0);
 
- gpt_sampler_accept(smpl, id, true);
+ common_sampler_accept(smpl, id, true);
 
  {
- const std::string token_str = llama_token_to_piece(ctx, id);
+ const std::string token_str = common_token_to_piece(ctx, id);
 
  LOG("%s", token_str.c_str());
  fflush(stdout);
@@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
  // debug
  if (dump_kv_cache) {
  llama_kv_cache_view_update(ctx, &kvc_view);
- llama_kv_cache_dump_view_seqs(kvc_view, 40);
+ common_kv_cache_dump_view_seqs(kvc_view, 40);
  }
 
  // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
@@ -201,10 +201,10 @@ int main(int argc, char ** argv) {
  // V V V V V V
  // id
  {
- llama_batch_clear(batch);
+ common_batch_clear(batch);
 
  // current token - first token of the first level
- llama_batch_add(batch, id, n_past, seq_id_all, true);
+ common_batch_add(batch, id, n_past, seq_id_all, true);
 
  // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
  {
@@ -229,7 +229,7 @@ int main(int argc, char ** argv) {
  ngrams_cur[g].tokens [j + 1] = t;
  ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
 
- llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
+ common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
  }
  }
  }
@@ -241,13 +241,13 @@ int main(int argc, char ** argv) {
  seq_id_look[j] = i + j + 1;
  }
 
- llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
+ common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
  }
 
  // fill the rest of the levels
  for (int j = 1; j < N - 1; j++) {
  for (int i = 0; i < W; i++) {
- llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
+ common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
  }
  }
  }
@@ -281,13 +281,13 @@ int main(int argc, char ** argv) {
  }
 
  // sample the next token
- id = gpt_sampler_sample(smpl, ctx, i_batch);
+ id = common_sampler_sample(smpl, ctx, i_batch);
 
- gpt_sampler_accept(smpl, id, true);
+ common_sampler_accept(smpl, id, true);
 
  // print
  {
- const std::string token_str = llama_token_to_piece(ctx, id);
+ const std::string token_str = common_token_to_piece(ctx, id);
 
  if (v == 0) {
  LOG("%s", token_str.c_str());
@@ -327,7 +327,7 @@ int main(int argc, char ** argv) {
  // print known n-grams starting with token id (debug)
  if (0 && v == 0) {
  if (ngrams_observed.cnt[id] > 0) {
- LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
+ LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str());
  }
 
  for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
@@ -336,7 +336,7 @@ int main(int argc, char ** argv) {
 
  const int idx = id*(N - 1)*G + i*(N - 1);
  for (int j = 0; j < N - 1; j++) {
- const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
+ const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
 
  LOG("%s", token_str.c_str());
  }
@@ -358,7 +358,7 @@ int main(int argc, char ** argv) {
  if (v == 0) {
  // sample from the last level
  for (int i = 0; i < W; i++) {
- tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
+ tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
  }
  } else {
  for (int i = 0; i < W; i++) {
@@ -466,9 +466,9 @@ int main(int argc, char ** argv) {
  LOG_INF("n_accept = %d\n", n_accept);
 
  LOG_INF("\n");
- gpt_perf_print(ctx, smpl);
+ common_perf_print(ctx, smpl);
 
- gpt_sampler_free(smpl);
+ common_sampler_free(smpl);
 
  llama_kv_cache_view_free(&kvc_view);
 
package/src/llama.cpp/examples/lookup/CMakeLists.txt

@@ -2,22 +2,22 @@ set(TARGET llama-lookup)
  add_executable(${TARGET} lookup.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
  set(TARGET llama-lookup-create)
  add_executable(${TARGET} lookup-create.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
  set(TARGET llama-lookup-merge)
  add_executable(${TARGET} lookup-merge.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
  set(TARGET llama-lookup-stats)
  add_executable(${TARGET} lookup-stats.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
package/src/llama.cpp/examples/lookup/lookup-create.cpp

@@ -12,9 +12,9 @@
  #include <vector>
 
  int main(int argc, char ** argv){
- gpt_params params;
+ common_params params;
 
- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
  return 1;
  }
 
@@ -23,7 +23,7 @@ int main(int argc, char ** argv){
  llama_numa_init(params.numa);
 
  // load the model
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);
 
  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;
@@ -31,15 +31,15 @@ int main(int argc, char ** argv){
 
  // tokenize the prompt
  std::vector<llama_token> inp;
- inp = ::llama_tokenize(ctx, params.prompt, true, true);
+ inp = common_tokenize(ctx, params.prompt, true, true);
  fprintf(stderr, "%s: tokenization done\n", __func__);
 
 
- llama_ngram_cache ngram_cache;
- llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
+ common_ngram_cache ngram_cache;
+ common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
  fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
 
- llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+ common_ngram_cache_save(ngram_cache, params.lookup_cache_static);
 
  return 0;
  }
package/src/llama.cpp/examples/lookup/lookup-merge.cpp

@@ -33,15 +33,15 @@ int main(int argc, char ** argv){
  }
 
  fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
- llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
+ common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]);
 
  for (size_t i = 1; i < args.size()-1; ++i) {
  fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
- llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
+ common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]);
 
- llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
+ common_ngram_cache_merge(ngram_cache_merged, ngram_cache);
  }
 
  fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
- llama_ngram_cache_save(ngram_cache_merged, args.back());
+ common_ngram_cache_save(ngram_cache_merged, args.back());
  }
package/src/llama.cpp/examples/lookup/lookup-stats.cpp

@@ -13,33 +13,34 @@
  #include <vector>
 
  int main(int argc, char ** argv){
- gpt_params params;
+ common_params params;
 
- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
  return 1;
  }
 
- gpt_init();
+ common_init();
 
- const int n_draft = params.n_draft;
+ const int n_draft = params.speculative.n_max;
 
  // init llama.cpp
  llama_backend_init();
  llama_numa_init(params.numa);
 
  // load the model
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);
 
  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;
 
  // tokenize the prompt
  std::vector<llama_token> inp;
- inp = ::llama_tokenize(ctx, params.prompt, true, true);
+ inp = common_tokenize(ctx, params.prompt, true, true);
+
+ common_ngram_cache ngram_cache_context;
+ common_ngram_cache ngram_cache_dynamic;
+ common_ngram_cache ngram_cache_static;
 
- llama_ngram_cache ngram_cache_context;
- llama_ngram_cache ngram_cache_dynamic;
- llama_ngram_cache ngram_cache_static;
  int64_t t_draft_flat_us = 0;
  int64_t t_draft_us = 0;
 
@@ -48,7 +49,7 @@ int main(int argc, char ** argv){
 
  if (!params.lookup_cache_static.empty()) {
  try {
- ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
+ ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
  } catch (std::ifstream::failure const &) {
  LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
  exit(1);
@@ -57,7 +58,7 @@ int main(int argc, char ** argv){
 
  if (!params.lookup_cache_dynamic.empty()) {
  try {
- ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
+ ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
  } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
  }
 
@@ -86,7 +87,7 @@ int main(int argc, char ** argv){
 
  {
  const int64_t t_start_draft_us = ggml_time_us();
- llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+ common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
  t_draft_us += ggml_time_us() - t_start_draft_us;
  }
 
@@ -105,7 +106,7 @@ int main(int argc, char ** argv){
 
  {
  const int64_t t_start_draft_us = ggml_time_us();
- llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+ common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
  t_draft_us += ggml_time_us() - t_start_draft_us;
  }
  }
@@ -115,7 +116,7 @@ int main(int argc, char ** argv){
  pseudo_output.push_back(inp_slice[pseudo_output.size()]);
  {
  const int64_t t_start_draft_us = ggml_time_us();
- llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+ common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
  t_draft_us += ggml_time_us() - t_start_draft_us;
  }
  }
@@ -133,7 +134,7 @@ int main(int argc, char ** argv){
  }
 
  // After each chunk, update the dynamic ngram cache with the context ngram cache:
- llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+ common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
  ngram_cache_context.clear();
  }
 
package/src/llama.cpp/examples/lookup/lookup.cpp

@@ -13,16 +13,16 @@
  #include <vector>
 
  int main(int argc, char ** argv){
- gpt_params params;
+ common_params params;
 
- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
  return 1;
  }
 
- gpt_init();
+ common_init();
 
  // max. number of additional tokens to draft if match is found
- const int n_draft = params.n_draft;
+ const int n_draft = params.speculative.n_max;
 
  const bool dump_kv_cache = params.dump_kv_cache;
 
@@ -31,29 +31,29 @@ int main(int argc, char ** argv){
  llama_numa_init(params.numa);
 
  // load the model
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);
 
  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;
 
  // tokenize the prompt
  std::vector<llama_token> inp;
- inp = ::llama_tokenize(ctx, params.prompt, true, true);
+ inp = common_tokenize(ctx, params.prompt, true, true);
 
- llama_ngram_cache ngram_cache_context;
- llama_ngram_cache ngram_cache_dynamic;
- llama_ngram_cache ngram_cache_static;
+ common_ngram_cache ngram_cache_context;
+ common_ngram_cache ngram_cache_dynamic;
+ common_ngram_cache ngram_cache_static;
  int64_t t_draft_flat_us = 0;
  int64_t t_draft_us = 0;
 
  {
  // Fill up context ngram cache with tokens from user input:
  const int64_t t_start_draft_us = ggml_time_us();
- llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
+ common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
 
  if (!params.lookup_cache_static.empty()) {
  try {
- ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
+ ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
  } catch (std::ifstream::failure const &) {
  LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
  exit(1);
@@ -62,7 +62,7 @@ int main(int argc, char ** argv){
 
  if (!params.lookup_cache_dynamic.empty()) {
  try {
- ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
+ ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
  } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
  }
 
@@ -80,7 +80,7 @@ int main(int argc, char ** argv){
  LOG("\n\n");
 
  for (auto id : inp) {
- LOG("%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", common_token_to_piece(ctx, id).c_str());
  }
 
  fflush(stderr);
@@ -89,8 +89,8 @@ int main(int argc, char ** argv){
 
  const auto t_enc_start = ggml_time_us();
 
- llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
- llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
+ llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+ llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
 
  const auto t_enc_end = ggml_time_us();
 
@@ -102,7 +102,7 @@ int main(int argc, char ** argv){
 
  bool has_eos = false;
 
- struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
+ struct common_sampler * smpl = common_sampler_init(model, params.sampling);
 
  std::vector<llama_token> draft;
 
@@ -117,7 +117,7 @@ int main(int argc, char ** argv){
  // debug
  if (dump_kv_cache) {
  llama_kv_cache_view_update(ctx, &kvc_view);
- llama_kv_cache_dump_view_seqs(kvc_view, 40);
+ common_kv_cache_dump_view_seqs(kvc_view, 40);
  }
 
  // print current draft sequence
@@ -126,11 +126,11 @@ int main(int argc, char ** argv){
  int i_dft = 0;
  while (true) {
  // sample from the target model
- llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);
+ llama_token id = common_sampler_sample(smpl, ctx, i_dft);
 
- gpt_sampler_accept(smpl, id, true);
+ common_sampler_accept(smpl, id, true);
 
- const std::string token_str = llama_token_to_piece(ctx, id);
+ const std::string token_str = common_token_to_piece(ctx, id);
 
  if (!params.use_color) {
  LOG("%s", token_str.c_str());
@@ -152,7 +152,7 @@ int main(int argc, char ** argv){
  {
  // Update context ngram cache with the newly accepted token:
  const int64_t t_start_draft_us = ggml_time_us();
- llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+ common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
  t_draft_us += ggml_time_us() - t_start_draft_us;
  }
 
@@ -178,7 +178,7 @@ int main(int argc, char ** argv){
  {
  // Update context ngram cache with the newly accepted token:
  const int64_t t_start_draft_us = ggml_time_us();
- llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+ common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
  t_draft_us += ggml_time_us() - t_start_draft_us;
  }
  break;
@@ -192,18 +192,18 @@ int main(int argc, char ** argv){
  // clean the cache of draft tokens that weren't accepted
  llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
 
- llama_batch_clear(batch_tgt);
- llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
+ common_batch_clear(batch_tgt);
+ common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
 
  // Draft already contains a single token sampled from the model:
  GGML_ASSERT(draft.size() == 1);
  GGML_ASSERT(draft[0] == inp.back());
  const int64_t t_start_draft_us = ggml_time_us();
 
- llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+ common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
 
  for (size_t i = 1; i < draft.size(); ++i) {
- llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
+ common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
  }
 
  t_draft_us += ggml_time_us() - t_start_draft_us;
@@ -218,8 +218,8 @@ int main(int argc, char ** argv){
  auto t_dec_end = ggml_time_us();
 
  // Update dynamic ngram cache with context ngram cache and save it to disk:
- llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
- llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
+ common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+ common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
 
  LOG("\n\n");
 
@@ -237,9 +237,9 @@ int main(int argc, char ** argv){
  LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
  LOG_INF("\ntarget:\n\n");
- gpt_perf_print(ctx, smpl);
+ common_perf_print(ctx, smpl);
 
- gpt_sampler_free(smpl);
+ common_sampler_free(smpl);
 
  llama_batch_free(batch_tgt);
 
package/src/llama.cpp/examples/main/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-cli)
  add_executable(${TARGET} main.cpp)
  install(TARGETS ${TARGET} RUNTIME)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)