@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/common/common.h
@@ -24,22 +24,24 @@
 
  #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
- struct llama_lora_adapter_info {
+ struct common_lora_adapter_info {
  std::string path;
  float scale;
  };
 
- struct llama_lora_adapter_container : llama_lora_adapter_info {
+ struct common_lora_adapter_container : common_lora_adapter_info {
  struct llama_lora_adapter * adapter;
  };
 
+ using llama_tokens = std::vector<llama_token>;
+
  // build info
  extern int LLAMA_BUILD_NUMBER;
- extern char const * LLAMA_COMMIT;
- extern char const * LLAMA_COMPILER;
- extern char const * LLAMA_BUILD_TARGET;
+ extern const char * LLAMA_COMMIT;
+ extern const char * LLAMA_COMPILER;
+ extern const char * LLAMA_BUILD_TARGET;
 
- struct llama_control_vector_load_info;
+ struct common_control_vector_load_info;
 
  //
  // CPU utils
@@ -78,18 +80,23 @@ enum llama_example {
  LLAMA_EXAMPLE_LLAVA,
  LLAMA_EXAMPLE_LOOKUP,
  LLAMA_EXAMPLE_PARALLEL,
+ LLAMA_EXAMPLE_TTS,
 
  LLAMA_EXAMPLE_COUNT,
  };
 
- enum gpt_sampler_type {
- GPT_SAMPLER_TYPE_NONE = 0,
- GPT_SAMPLER_TYPE_TOP_K = 1,
- GPT_SAMPLER_TYPE_TOP_P = 2,
- GPT_SAMPLER_TYPE_MIN_P = 3,
- GPT_SAMPLER_TYPE_TFS_Z = 4,
- GPT_SAMPLER_TYPE_TYPICAL_P = 5,
- GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+ enum common_sampler_type {
+ COMMON_SAMPLER_TYPE_NONE = 0,
+ COMMON_SAMPLER_TYPE_DRY = 1,
+ COMMON_SAMPLER_TYPE_TOP_K = 2,
+ COMMON_SAMPLER_TYPE_TOP_P = 3,
+ COMMON_SAMPLER_TYPE_MIN_P = 4,
+ //COMMON_SAMPLER_TYPE_TFS_Z = 5,
+ COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
+ COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
+ COMMON_SAMPLER_TYPE_XTC = 8,
+ COMMON_SAMPLER_TYPE_INFILL = 9,
+ COMMON_SAMPLER_TYPE_PENALTIES = 10,
  };
 
  // dimensionality reduction methods, used by cvector-generator
@@ -98,39 +105,49 @@ enum dimre_method {
  DIMRE_METHOD_MEAN,
  };
 
- // sampler parameters
- struct gpt_sampler_params {
+ // sampling parameters
+ struct common_params_sampling {
  uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
 
- int32_t n_prev = 64; // number of previous tokens to remember
- int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
- int32_t top_k = 40; // <= 0 to use vocab size
- float top_p = 0.95f; // 1.0 = disabled
- float min_p = 0.05f; // 0.0 = disabled
- float tfs_z = 1.00f; // 1.0 = disabled
- float typ_p = 1.00f; // typical_p, 1.0 = disabled
- float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
- float dynatemp_range = 0.00f; // 0.0 = disabled
- float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
- int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
- float penalty_repeat = 1.00f; // 1.0 = disabled
- float penalty_freq = 0.00f; // 0.0 = disabled
- float penalty_present = 0.00f; // 0.0 = disabled
- int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
- float mirostat_tau = 5.00f; // target entropy
- float mirostat_eta = 0.10f; // learning rate
- bool penalize_nl = false; // consider newlines as a repeatable token
- bool ignore_eos = false;
- bool no_perf = false; // disable performance metrics
-
- std::vector<enum gpt_sampler_type> samplers = {
- GPT_SAMPLER_TYPE_TOP_K,
- GPT_SAMPLER_TYPE_TFS_Z,
- GPT_SAMPLER_TYPE_TYPICAL_P,
- GPT_SAMPLER_TYPE_TOP_P,
- GPT_SAMPLER_TYPE_MIN_P,
- GPT_SAMPLER_TYPE_TEMPERATURE
+ int32_t n_prev = 64; // number of previous tokens to remember
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+ int32_t top_k = 40; // <= 0 to use vocab size
+ float top_p = 0.95f; // 1.0 = disabled
+ float min_p = 0.05f; // 0.0 = disabled
+ float xtc_probability = 0.00f; // 0.0 = disabled
+ float xtc_threshold = 0.10f; // > 0.5 disables XTC
+ float typ_p = 1.00f; // typical_p, 1.0 = disabled
+ float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+ float dynatemp_range = 0.00f; // 0.0 = disabled
+ float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+ int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+ float penalty_repeat = 1.00f; // 1.0 = disabled
+ float penalty_freq = 0.00f; // 0.0 = disabled
+ float penalty_present = 0.00f; // 0.0 = disabled
+ float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
+ float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
+ int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
+ int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+ float mirostat_tau = 5.00f; // target entropy
+ float mirostat_eta = 0.10f; // learning rate
+ bool ignore_eos = false;
+ bool no_perf = false; // disable performance metrics
+ bool timing_per_token = false;
+
+ std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
+
+
+ std::vector<enum common_sampler_type> samplers = {
+ COMMON_SAMPLER_TYPE_PENALTIES,
+ COMMON_SAMPLER_TYPE_DRY,
+ COMMON_SAMPLER_TYPE_TOP_K,
+ COMMON_SAMPLER_TYPE_TYPICAL_P,
+ COMMON_SAMPLER_TYPE_TOP_P,
+ COMMON_SAMPLER_TYPE_MIN_P,
+ COMMON_SAMPLER_TYPE_XTC,
+ COMMON_SAMPLER_TYPE_TEMPERATURE,
  };
 
  std::string grammar; // optional BNF-like grammar to constrain sampling
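For orientation, a minimal sketch (not part of the diff) of how the renamed sampling struct can be configured; it uses only fields visible in the hunk above and assumes the vendored common.h is included:

    common_params_sampling sparams;          // replaces gpt_sampler_params
    sparams.temp  = 0.7f;                    // <= 0.0 samples greedily
    sparams.top_k = 50;                      // <= 0 uses the full vocab
    sparams.min_p = 0.1f;                    // 0.0 = disabled
    sparams.dry_multiplier  = 0.8f;          // DRY is new here and off by default
    sparams.xtc_probability = 0.5f;          // XTC is new here and off by default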
@@ -141,21 +158,39 @@ struct gpt_sampler_params {
  std::string print() const;
  };
 
- struct gpt_params {
+ struct common_params_speculative {
+ std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+ int32_t n_ctx = 0; // draft context size
+ int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
+ int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+ float p_split = 0.1f; // speculative decoding split probability
+ float p_min = 0.9f; // minimum speculative decoding probability (greedy)
+
+ struct cpu_params cpuparams;
+ struct cpu_params cpuparams_batch;
+
+ std::string model = ""; // draft model for speculative decoding // NOLINT
+ };
+
+ struct common_params_vocoder {
+ std::string hf_repo = ""; // HF repo // NOLINT
+ std::string hf_file = ""; // HF file // NOLINT
+
+ std::string model = ""; // model path // NOLINT
+ std::string model_url = ""; // model url to download // NOLINT
+ };
+
+ struct common_params {
  int32_t n_predict = -1; // new tokens to predict
- int32_t n_ctx = 0; // context size
+ int32_t n_ctx = 4096; // context size
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
  int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
  int32_t n_keep = 0; // number of tokens to keep from initial prompt
- int32_t n_draft = 5; // number of tokens to draft during speculative decoding
  int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
  int32_t n_parallel = 1; // number of parallel sequences to decode
  int32_t n_sequences = 1; // number of sequences to decode
- float p_split = 0.1f; // speculative decoding split probability
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
- int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
- float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
  int32_t grp_attn_n = 1; // group-attention factor
  int32_t grp_attn_w = 512; // group-attention width
  int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -166,28 +201,35 @@ struct gpt_params {
  float yarn_beta_fast = 32.0f; // YaRN low correction dim
  float yarn_beta_slow = 1.0f; // YaRN high correction dim
  int32_t yarn_orig_ctx = 0; // YaRN original context length
- float defrag_thold = -1.0f; // KV cache defragmentation threshold
+ float defrag_thold = 0.1f; // KV cache defragmentation threshold
+
+ // offload params
+ std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
+ enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
  struct cpu_params cpuparams;
  struct cpu_params cpuparams_batch;
- struct cpu_params draft_cpuparams;
- struct cpu_params draft_cpuparams_batch;
 
  ggml_backend_sched_eval_callback cb_eval = nullptr;
  void * cb_eval_user_data = nullptr;
 
  ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
- enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
  enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
  enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
- struct gpt_sampler_params sparams;
+ struct common_params_sampling sampling;
+ struct common_params_speculative speculative;
+ struct common_params_vocoder vocoder;
 
  std::string model = ""; // model path // NOLINT
- std::string model_draft = ""; // draft model for speculative decoding // NOLINT
- std::string model_alias = "unknown"; // model alias // NOLINT
+ std::string model_alias = ""; // model alias // NOLINT
  std::string model_url = ""; // model url to download // NOLINT
  std::string hf_token = ""; // HF token // NOLINT
  std::string hf_repo = ""; // HF repo // NOLINT
@@ -197,7 +239,6 @@ struct gpt_params {
  std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
  std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
  std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
- std::string logdir = ""; // directory in which to save YAML log files // NOLINT
  std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
  std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
  std::string logits_file = ""; // file for saving *all* logits // NOLINT
@@ -208,9 +249,9 @@ struct gpt_params {
  std::vector<llama_model_kv_override> kv_overrides;
 
  bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
- std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+ std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
- std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+ std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
  int32_t verbosity = 0;
  int32_t control_vector_layer_start = -1; // layer range for control vector
@@ -259,8 +300,8 @@ struct gpt_params {
  bool warmup = true; // warmup run
  bool check_tensors = false; // validate tensor data
 
- std::string cache_type_k = "f16"; // KV cache data type for the K
- std::string cache_type_v = "f16"; // KV cache data type for the V
+ ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+ ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
  // multimodal models (see examples/llava)
  std::string mmproj = ""; // path to multimodal projector // NOLINT
@@ -268,21 +309,21 @@ struct gpt_params {
 
  // embedding
  bool embedding = false; // get only sentence embedding
- int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+ int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
  std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
- std::string embd_sep = "\n"; // separator of embendings
+ std::string embd_sep = "\n"; // separator of embeddings
  bool reranking = false; // enable reranking support on server
 
  // server params
  int32_t port = 8080; // server listens on this network port
  int32_t timeout_read = 600; // http read timeout in seconds
  int32_t timeout_write = timeout_read; // http write timeout in seconds
- int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+ int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+ int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
 
  std::string hostname = "127.0.0.1";
  std::string public_path = ""; // NOLINT
  std::string chat_template = ""; // NOLINT
- std::string system_prompt = ""; // NOLINT
  bool enable_chat_template = true;
 
  std::vector<std::string> api_keys;
@@ -290,7 +331,10 @@ struct gpt_params {
  std::string ssl_file_key = ""; // NOLINT
  std::string ssl_file_cert = ""; // NOLINT
 
- bool endpoint_slots = true;
+ // "advanced" endpoints are disabled by default for better security
+ bool webui = true;
+ bool endpoint_slots = false;
+ bool endpoint_props = false; // only control POST requests, not GET
  bool endpoint_metrics = false;
 
  bool log_json = false;
@@ -345,20 +389,31 @@ struct gpt_params {
 
  // call once at the start of a program if it uses libcommon
  // initializes the logging system and prints info about the build
- void gpt_init();
+ void common_init();
 
- std::string gpt_params_get_system_info(const gpt_params & params);
+ std::string common_params_get_system_info(const common_params & params);
 
- bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
- bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
- void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+ bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+ bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+ void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
  bool set_process_priority(enum ggml_sched_priority prio);
 
  //
  // String utils
  //
 
- std::vector<std::string> string_split(std::string input, char separator);
+ #ifdef __GNUC__
+ #ifdef __MINGW32__
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ #else
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+ #endif
+ #else
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+ #endif
+
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+ std::string string_format(const char * fmt, ...);
 
  std::string string_strip(const std::string & str);
  std::string string_get_sortable_timestamp();
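A small sketch (not part of the diff) of the new string_format helper declared above; the format string is checked by the compiler where LLAMA_COMMON_ATTRIBUTE_FORMAT expands to a printf attribute, and the values are made up:

    std::string msg = string_format("loaded %d layers in %.2f s", 32, 1.57);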
@@ -367,6 +422,7 @@ void string_replace_all(std::string & s, const std::string & search, const std::
 
  template<class T>
  static std::vector<T> string_split(const std::string & str, char delim) {
+ static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
  std::vector<T> values;
  std::istringstream str_stream(str);
  std::string token;
@@ -379,6 +435,27 @@ static std::vector<T> string_split(const std::string & str, char delim) {
  return values;
  }
 
+ template<>
+ std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+ {
+ std::vector<std::string> parts;
+ size_t begin_pos = 0;
+ size_t separator_pos = input.find(separator);
+ while (separator_pos != std::string::npos) {
+ std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+ parts.emplace_back(part);
+ begin_pos = separator_pos + 1;
+ separator_pos = input.find(separator, begin_pos);
+ }
+ parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+ return parts;
+ }
+
+ static bool string_starts_with(const std::string & str,
+ const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
+ return str.rfind(prefix, 0) == 0;
+ }
+
  bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
  void string_process_escapes(std::string & input);
 
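A usage sketch (not part of the diff) for the new std::string specialization and string_starts_with added above; the literals are made up:

    // splits on a single character and keeps empty fields
    std::vector<std::string> parts = string_split<std::string>("cuda:0,,cuda:1", ',');
    // parts == {"cuda:0", "", "cuda:1"}
    bool ok = string_starts_with("models/7B/ggml-model-f16.gguf", "models/");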
@@ -401,48 +478,69 @@ std::string fs_get_cache_file(const std::string & filename);
  // Model utils
  //
 
- struct llama_init_result {
+ struct common_init_result {
  struct llama_model * model = nullptr;
  struct llama_context * context = nullptr;
- std::vector<llama_lora_adapter_container> lora_adapters;
+ std::vector<common_lora_adapter_container> lora_adapters;
  };
 
- struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
+ struct common_init_result common_init_from_params(common_params & params);
 
- struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
- struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+ struct llama_model_params common_model_params_to_llama ( common_params & params);
+ struct llama_context_params common_context_params_to_llama(const common_params & params);
  struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
- struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
- struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+ struct llama_model * common_load_model_from_url(
+ const std::string & model_url,
+ const std::string & local_path,
+ const std::string & hf_token,
+ const struct llama_model_params & params);
+ struct llama_model * common_load_model_from_hf(
+ const std::string & repo,
+ const std::string & remote_path,
+ const std::string & local_path,
+ const std::string & hf_token,
+ const struct llama_model_params & params);
 
  // clear LoRA adapters from context, then apply new list of adapters
- void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
 
+ //
  // Batch utils
+ //
 
- void llama_batch_clear(struct llama_batch & batch);
+ void common_batch_clear(struct llama_batch & batch);
 
- void llama_batch_add(
+ void common_batch_add(
  struct llama_batch & batch,
  llama_token id,
  llama_pos pos,
  const std::vector<llama_seq_id> & seq_ids,
  bool logits);
 
+ //
+ // Token utils
+ //
+
+ // longest common prefix
+ size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+ // longet common subsequence
+ size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
  //
  // Vocab utils
  //
 
  // tokenizes a string into a vector of tokens
  // should work similar to Python's `tokenizer.encode`
- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_context * ctx,
  const std::string & text,
  bool add_special,
  bool parse_special = false);
 
- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_model * model,
  const std::string & text,
  bool add_special,
@@ -450,7 +548,7 @@ std::vector<llama_token> llama_tokenize(
 
  // tokenizes a token into a piece, optionally renders special/control tokens
  // should work similar to Python's `tokenizer.id_to_piece`
- std::string llama_token_to_piece(
+ std::string common_token_to_piece(
  const struct llama_context * ctx,
  llama_token token,
  bool special = true);
@@ -458,7 +556,7 @@ std::string llama_token_to_piece(
  // detokenizes a vector of tokens into a string
  // should work similar to Python's `tokenizer.decode`
  // optionally renders special/control tokens
- std::string llama_detokenize(
+ std::string common_detokenize(
  llama_context * ctx,
  const std::vector<llama_token> & tokens,
  bool special = true);
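A round-trip sketch (not part of the diff) of the renamed tokenizer helpers above, assuming a valid llama_context obtained elsewhere and the vendored common.h (plus <cstdio>) included:

    static void tokenize_demo(llama_context * ctx, const std::string & text) {
        std::vector<llama_token> toks = common_tokenize(ctx, text, /*add_special=*/true);
        for (llama_token t : toks) {
            printf("[%s]", common_token_to_piece(ctx, t).c_str());  // per-token pieces
        }
        printf("\n%s\n", common_detokenize(ctx, toks).c_str());     // full decode
    }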
@@ -468,31 +566,31 @@ std::string llama_detokenize(
  //
 
  // same with llama_chat_message, but uses std::string
- struct llama_chat_msg {
+ struct common_chat_msg {
  std::string role;
  std::string content;
  };
 
  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
- bool llama_chat_verify_template(const std::string & tmpl);
+ bool common_chat_verify_template(const std::string & tmpl);
 
  // CPP wrapper for llama_chat_apply_template
  // If the built-in template is not supported, we default to chatml
  // If the custom "tmpl" is not supported, we throw an error
- std::string llama_chat_apply_template(const struct llama_model * model,
+ std::string common_chat_apply_template(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & chat,
+ const std::vector<common_chat_msg> & chat,
  bool add_ass);
 
  // Format single message, while taking into account the position of that message in chat history
- std::string llama_chat_format_single(const struct llama_model * model,
+ std::string common_chat_format_single(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & past_msg,
- const llama_chat_msg & new_msg,
+ const std::vector<common_chat_msg> & past_msg,
+ const common_chat_msg & new_msg,
  bool add_ass);
 
  // Returns an example of formatted chat
- std::string llama_chat_format_example(const struct llama_model * model,
+ std::string common_chat_format_example(const struct llama_model * model,
  const std::string & tmpl);
 
  //
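A sketch (not part of the diff) of the renamed chat helpers, assuming a loaded llama_model * model; the messages are made up, and an empty tmpl is expected to use the model's built-in template, falling back to chatml per the comment above:

    std::vector<common_chat_msg> chat = {
        {"system", "You are a helpful assistant."},
        {"user",   "Hello!"},
    };
    std::string prompt = common_chat_apply_template(model, "", chat, /*add_ass=*/true);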
@@ -500,31 +598,32 @@ std::string llama_chat_format_example(const struct llama_model * model,
  //
 
  // Dump the KV cache view with the number of sequences per cell.
- void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
 
  // Dump the KV cache view showing individual sequences in each cell (long output).
- void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
 
  //
  // Embedding utils
  //
 
- void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+ // TODO: repace embd_norm with an enum
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 
- float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
  //
  // Control vector utils
  //
 
- struct llama_control_vector_data {
+ struct common_control_vector_data {
  int n_embd;
 
  // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
  std::vector<float> data;
  };
 
- struct llama_control_vector_load_info {
+ struct common_control_vector_load_info {
  float strength;
 
  std::string fname;
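A sketch (not part of the diff) of the renamed embedding helpers above; raw, other and n_embd are hypothetical buffers and sizes, and embd_norm = 2 selects euclidean normalisation per the embd_normalize comment earlier in this file:

    std::vector<float> norm(n_embd);
    common_embd_normalize(raw.data(), norm.data(), n_embd, /*embd_norm=*/2);
    float sim = common_embd_similarity_cos(norm.data(), other.data(), n_embd);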
@@ -532,7 +631,7 @@ struct llama_control_vector_load_info {
  };
  // Load control vectors, scale each by strength, and add them together.
  // On error, returns {-1, empty}
- llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
 
  //
  // Split utils
@@ -541,15 +640,3 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
  static const char * const LLM_KV_SPLIT_NO = "split.no";
  static const char * const LLM_KV_SPLIT_COUNT = "split.count";
  static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
- //
- // YAML utils
- //
-
- void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
- void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
- void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
- void yaml_dump_non_result_info(
- FILE * stream, const gpt_params & params, const llama_context * lctx,
- const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
package/src/llama.cpp/common/json-schema-to-grammar.cpp
@@ -611,7 +611,7 @@ private:
  }
  return join_seq();
  };
- return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
+ return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
  }
 
  /*