@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/common/common.cpp

@@ -12,6 +12,7 @@

  #include <algorithm>
  #include <cinttypes>
+ #include <climits>
  #include <cmath>
  #include <codecvt>
  #include <cstdarg>
@@ -23,10 +24,10 @@
  #include <regex>
  #include <sstream>
  #include <string>
+ #include <thread>
  #include <unordered_map>
  #include <unordered_set>
  #include <vector>
- #include <thread>

  #if defined(__APPLE__) && defined(__MACH__)
  #include <sys/types.h>
@@ -362,10 +363,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
  return true;
  }

- void gpt_init() {
+ void common_init() {
  llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
- if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
- gpt_log_add(gpt_log_main(), level, "%s", text);
+ if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+ common_log_add(common_log_main(), level, "%s", text);
  }
  }, NULL);

@@ -378,7 +379,7 @@ void gpt_init() {
  LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
  }

- std::string gpt_params_get_system_info(const gpt_params & params) {
+ std::string common_params_get_system_info(const common_params & params) {
  std::ostringstream os;

  os << "system_info: n_threads = " << params.cpuparams.n_threads;
@@ -400,17 +401,19 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
  // String utils
  //

- std::vector<std::string> string_split(std::string input, char separator) {
- std::vector<std::string> parts;
- size_t separator_pos = input.find(separator);
- while (separator_pos != std::string::npos) {
- std::string part = input.substr(0, separator_pos);
- parts.emplace_back(part);
- input = input.substr(separator_pos + 1);
- separator_pos = input.find(separator);
- }
- parts.emplace_back(input);
- return parts;
+ std::string string_format(const char * fmt, ...) {
+ va_list ap;
+ va_list ap2;
+ va_start(ap, fmt);
+ va_copy(ap2, ap);
+ int size = vsnprintf(NULL, 0, fmt, ap);
+ GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+ std::vector<char> buf(size + 1);
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+ GGML_ASSERT(size2 == size);
+ va_end(ap2);
+ va_end(ap);
+ return std::string(buf.data(), size);
  }

  std::string string_strip(const std::string & str) {
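The hunk above replaces the old char-separator string_split overload with a printf-style string_format helper. A minimal usage sketch, not part of the diff, assuming the function is declared in the vendored common.h:

    // illustrative call site for the new helper
    std::string msg = string_format("n_ctx = %d, temp = %.2f", 4096, 0.80);
    // msg == "n_ctx = 4096, temp = 0.80"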
@@ -493,7 +496,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
  first = false;
  }

- auto detokenized = llama_token_to_piece(ctx, token);
+ auto detokenized = common_token_to_piece(ctx, token);

  detokenized.erase(
  std::remove_if(
@@ -524,7 +527,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
  first = false;
  }

- auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
+ auto detokenized = common_token_to_piece(ctx, batch.token[i]);

  detokenized.erase(
  std::remove_if(
@@ -533,12 +536,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
  [](const unsigned char c) { return !std::isprint(c); }),
  detokenized.end());

- buf << "\n" << std::to_string(i)
- << ":token '" << detokenized << "'"
- << ":pos " << std::to_string(batch.pos[i])
- << ":n_seq_id " << std::to_string(batch.n_seq_id[i])
- << ":seq_id " << std::to_string(batch.seq_id[i][0])
- << ":logits " << std::to_string(batch.logits[i]);
+ buf << "\n" << std::to_string(i)
+ << ", token '" << detokenized << "'"
+ << ", pos " << std::to_string(batch.pos[i])
+ << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
+ << ", seq_id " << std::to_string(batch.seq_id[i][0])
+ << ", logits " << std::to_string(batch.logits[i]);
  }

  buf << " ]";
@@ -649,7 +652,17 @@ bool fs_validate_filename(const std::string & filename) {

  std::u32string filename_utf32;
  try {
+ #if defined(__clang__)
+ // disable C++17 deprecation warning for std::codecvt_utf8
+ # pragma clang diagnostic push
+ # pragma clang diagnostic ignored "-Wdeprecated-declarations"
+ #endif
  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
+
+ #if defined(__clang__)
+ # pragma clang diagnostic pop
+ #endif
+
  filename_utf32 = converter.from_bytes(filename);

  // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
@@ -819,16 +832,16 @@ std::string fs_get_cache_file(const std::string & filename) {
  //
  // Model utils
  //
- struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
- llama_init_result iparams;
- auto mparams = llama_model_params_from_gpt_params(params);
+ struct common_init_result common_init_from_params(common_params & params) {
+ common_init_result iparams;
+ auto mparams = common_model_params_to_llama(params);

  llama_model * model = nullptr;

  if (!params.hf_repo.empty() && !params.hf_file.empty()) {
- model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+ model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
  } else if (!params.model_url.empty()) {
- model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+ model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
  } else {
  model = llama_load_model_from_file(params.model.c_str(), mparams);
  }
@@ -863,7 +876,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  }
  }

- auto cparams = llama_context_params_from_gpt_params(params);
+ auto cparams = common_context_params_to_llama(params);

  llama_context * lctx = llama_new_context_with_model(model, cparams);
  if (lctx == NULL) {
@@ -872,11 +885,17 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  return iparams;
  }

+ if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+ LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
+ llama_free_model(model);
+ return iparams;
+ }
+
  if (!params.control_vectors.empty()) {
  if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
  if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);

- const auto cvec = llama_control_vector_load(params.control_vectors);
+ const auto cvec = common_control_vector_load(params.control_vectors);
  if (cvec.n_embd == -1) {
  llama_free(lctx);
  llama_free_model(model);
@@ -900,7 +919,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

  // load and optionally apply lora adapters
  for (auto & la : params.lora_adapters) {
- llama_lora_adapter_container loaded_la;
+ common_lora_adapter_container loaded_la;
  loaded_la.path = la.path;
  loaded_la.scale = la.scale;
  loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
@@ -913,12 +932,31 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
  }
  if (!params.lora_init_without_apply) {
- llama_lora_adapters_apply(lctx, iparams.lora_adapters);
+ common_lora_adapters_apply(lctx, iparams.lora_adapters);
  }

- if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+ if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
  LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
- params.sparams.ignore_eos = false;
+ params.sampling.ignore_eos = false;
+ }
+
+ if (params.sampling.ignore_eos) {
+ for (llama_token i = 0; i < llama_n_vocab(model); i++) {
+ if (llama_token_is_eog(model, i)) {
+ LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+ params.sampling.logit_bias.push_back({i, -INFINITY});
+ }
+ }
+ }
+
+ if (params.sampling.penalty_last_n == -1) {
+ LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+ params.sampling.penalty_last_n = llama_n_ctx(lctx);
+ }
+
+ if (params.sampling.dry_penalty_last_n == -1) {
+ LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+ params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
  }

  if (params.warmup) {
@@ -939,7 +977,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  }

  if (llama_model_has_encoder(model)) {
- llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+ llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
  llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
  if (decoder_start_token_id == -1) {
  decoder_start_token_id = bos;
@@ -948,7 +986,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  tmp.push_back(decoder_start_token_id);
  }
  if (llama_model_has_decoder(model)) {
- llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
  }
  llama_kv_cache_clear(lctx);
  llama_synchronize(lctx);
@@ -961,7 +999,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  return iparams;
  }

- void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
  llama_lora_adapter_clear(ctx);
  for (auto & la : lora_adapters) {
  if (la.scale != 0.0f) {
@@ -970,9 +1008,12 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
  }
  }

- struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+ struct llama_model_params common_model_params_to_llama(common_params & params) {
  auto mparams = llama_model_default_params();

+ if (!params.devices.empty()) {
+ mparams.devices = params.devices.data();
+ }
  if (params.n_gpu_layers != -1) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }
@@ -993,36 +1034,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
  return mparams;
  }

- static ggml_type kv_cache_type_from_str(const std::string & s) {
- if (s == "f32") {
- return GGML_TYPE_F32;
- }
- if (s == "f16") {
- return GGML_TYPE_F16;
- }
- if (s == "q8_0") {
- return GGML_TYPE_Q8_0;
- }
- if (s == "q4_0") {
- return GGML_TYPE_Q4_0;
- }
- if (s == "q4_1") {
- return GGML_TYPE_Q4_1;
- }
- if (s == "iq4_nl") {
- return GGML_TYPE_IQ4_NL;
- }
- if (s == "q5_0") {
- return GGML_TYPE_Q5_0;
- }
- if (s == "q5_1") {
- return GGML_TYPE_Q5_1;
- }
-
- throw std::runtime_error("Invalid cache type: " + s);
- }
-
- struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+ struct llama_context_params common_context_params_to_llama(const common_params & params) {
  auto cparams = llama_context_default_params();

  cparams.n_ctx = params.n_ctx;
@@ -1031,7 +1043,7 @@ struct llama_context_params llama_context_params_from_gpt_param
  cparams.n_ubatch = params.n_ubatch;
  cparams.n_threads = params.cpuparams.n_threads;
  cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
- params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
+ params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
  cparams.logits_all = params.logits_all;
  cparams.embeddings = params.embedding;
  cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1056,8 +1068,8 @@ struct llama_context_params llama_context_params_from_gpt_param
  cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
  }

- cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
- cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
+ cparams.type_k = params.cache_type_k;
+ cparams.type_v = params.cache_type_v;

  return cparams;
  }
@@ -1083,13 +1095,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
  #define CURL_MAX_RETRY 3
  #define CURL_RETRY_DELAY_SECONDS 2

-
- static bool starts_with(const std::string & str, const std::string & prefix) {
- // While we wait for C++20's std::string::starts_with...
- return str.rfind(prefix, 0) == 0;
- }
-
- static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
  int remaining_attempts = max_attempts;

  while (remaining_attempts > 0) {
@@ -1112,8 +1118,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
  return false;
  }

- static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-
+ static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
  // Initialize libcurl
  std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
  if (!curl) {
@@ -1182,15 +1187,17 @@ static bool llama_download_file(const std::string & url, const std::string & pat
  }

  // Send a HEAD request to retrieve the etag and last-modified headers
- struct llama_load_model_from_url_headers {
+ struct common_load_model_from_url_headers {
  std::string etag;
  std::string last_modified;
  };
- llama_load_model_from_url_headers headers;
+
+ common_load_model_from_url_headers headers;
+
  {
  typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
  auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
- llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
+ common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;

  static std::regex header_regex("([^:]+): (.*)\r\n");
  static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1326,18 +1333,18 @@ static bool llama_download_file(const std::string & url, const std::string & pat
  return true;
  }

- struct llama_model * llama_load_model_from_url(
- const char * model_url,
- const char * path_model,
- const char * hf_token,
+ struct llama_model * common_load_model_from_url(
+ const std::string & model_url,
+ const std::string & local_path,
+ const std::string & hf_token,
  const struct llama_model_params & params) {
  // Basic validation of the model_url
- if (!model_url || strlen(model_url) == 0) {
+ if (model_url.empty()) {
  LOG_ERR("%s: invalid model_url\n", __func__);
  return NULL;
  }

- if (!llama_download_file(model_url, path_model, hf_token)) {
+ if (!common_download_file(model_url, local_path, hf_token)) {
  return NULL;
  }

@@ -1348,9 +1355,9 @@ struct llama_model * llama_load_model_from_url(
  /*.no_alloc = */ true,
  /*.ctx = */ NULL,
  };
- auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
+ auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
  if (!ctx_gguf) {
- LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
+ LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
  return NULL;
  }

@@ -1369,13 +1376,13 @@ struct llama_model * llama_load_model_from_url(
  // Verify the first split file format
  // and extract split URL and PATH prefixes
  {
- if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
- LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
+ LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
  return NULL;
  }

- if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
- LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
+ if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
+ LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
  return NULL;
  }
  }
@@ -1390,7 +1397,7 @@ struct llama_model * llama_load_model_from_url(
  char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
  llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);

- return llama_download_file(split_url, split_path, hf_token);
+ return common_download_file(split_url, split_path, hf_token);
  }, idx));
  }

@@ -1402,14 +1409,14 @@ struct llama_model * llama_load_model_from_url(
  }
  }

- return llama_load_model_from_file(path_model, params);
+ return llama_load_model_from_file(local_path.c_str(), params);
  }

- struct llama_model * llama_load_model_from_hf(
- const char * repo,
- const char * model,
- const char * path_model,
- const char * hf_token,
+ struct llama_model * common_load_model_from_hf(
+ const std::string & repo,
+ const std::string & remote_path,
+ const std::string & local_path,
+ const std::string & hf_token,
  const struct llama_model_params & params) {
  // construct hugging face model url:
  //
@@ -1423,27 +1430,27 @@ struct llama_model * llama_load_model_from_hf(
  std::string model_url = "https://huggingface.co/";
  model_url += repo;
  model_url += "/resolve/main/";
- model_url += model;
+ model_url += remote_path;

- return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+ return common_load_model_from_url(model_url, local_path, hf_token, params);
  }

  #else

- struct llama_model * llama_load_model_from_url(
- const char * /*model_url*/,
- const char * /*path_model*/,
- const char * /*hf_token*/,
+ struct llama_model * common_load_model_from_url(
+ const std::string & /*model_url*/,
+ const std::string & /*local_path*/,
+ const std::string & /*hf_token*/,
  const struct llama_model_params & /*params*/) {
  LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
  return nullptr;
  }

- struct llama_model * llama_load_model_from_hf(
- const char * /*repo*/,
- const char * /*model*/,
- const char * /*path_model*/,
- const char * /*hf_token*/,
+ struct llama_model * common_load_model_from_hf(
+ const std::string & /*repo*/,
+ const std::string & /*remote_path*/,
+ const std::string & /*local_path*/,
+ const std::string & /*hf_token*/,
  const struct llama_model_params & /*params*/) {
  LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
  return nullptr;
@@ -1455,11 +1462,11 @@ struct llama_model * llama_load_model_from_hf(
  // Batch utils
  //

- void llama_batch_clear(struct llama_batch & batch) {
+ void common_batch_clear(struct llama_batch & batch) {
  batch.n_tokens = 0;
  }

- void llama_batch_add(
+ void common_batch_add(
  struct llama_batch & batch,
  llama_token id,
  llama_pos pos,
@@ -1478,19 +1485,79 @@ void llama_batch_add(
  batch.n_tokens++;
  }

+ //
+ // Token utils
+ //
+
+ size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
+ size_t i;
+ for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+ return i;
+ }
+
+ size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
+ // check for empty sequences
+ if (a.empty() || b.empty()) {
+ return 0;
+ }
+
+ // get the lengths of the input sequences
+ size_t a_len = a.size();
+ size_t b_len = b.size();
+
+ // initialize the maximum length of the longest common subsequence (LCS)
+ size_t max_length = 0;
+
+ // use two rows instead of a 2D matrix to optimize space
+ std::vector<size_t> prev_row(b_len + 1, 0);
+ std::vector<size_t> curr_row(b_len + 1, 0);
+
+ // iterate through the elements of a
+ for (size_t i = 1; i <= a_len; i++) {
+ // iterate through the elements of b
+ for (size_t j = 1; j <= b_len; j++) {
+ // if elements at the current positions match
+ if (a[i - 1] == b[j - 1]) {
+ // if it's the first element of either sequences, set LCS length to 1
+ if (i == 1 || j == 1) {
+ curr_row[j] = 1;
+ } else {
+ // increment LCS length by 1 compared to the previous element
+ curr_row[j] = prev_row[j - 1] + 1;
+ }
+
+ // update max_length if necessary
+ if (curr_row[j] > max_length) {
+ max_length = curr_row[j];
+ }
+ } else {
+ // reset LCS length if elements don't match
+ curr_row[j] = 0;
+ }
+ }
+
+ // update the previous row for the next iteration
+ prev_row = curr_row;
+ }
+
+ // return the maximum length of the LCS
+ return max_length;
+ }
+
  //
  // Vocab utils
  //

- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_context * ctx,
  const std::string & text,
  bool add_special,
  bool parse_special) {
- return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+ return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
  }

- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_model * model,
  const std::string & text,
  bool add_special,
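The hunk above also introduces the token-utility helpers common_lcp and common_lcs. A hedged usage sketch, not part of the diff, assuming llama_tokens is the std::vector<llama_token> alias used by the vendored common.h and the token IDs below are purely hypothetical:

    // compare a cached prompt against an incoming one
    llama_tokens cached   = {1, 15043, 3186, 29991};
    llama_tokens incoming = {1, 15043, 3186, 29892, 920};
    size_t n_prefix = common_lcp(cached, incoming); // 3: shared leading tokens
    size_t n_run    = common_lcs(cached, incoming); // 3: longest matching contiguous run, per the implementation above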
@@ -1509,7 +1576,7 @@ std::vector<llama_token> llama_tokenize(
  return result;
  }

- std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+ std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
  std::string piece;
  piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
  const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
@@ -1525,7 +1592,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
  return piece;
  }

- std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
  std::string text;
  text.resize(std::max(text.capacity(), tokens.size()));
  int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@@ -1545,15 +1612,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
  // Chat template utils
  //

- bool llama_chat_verify_template(const std::string & tmpl) {
+ bool common_chat_verify_template(const std::string & tmpl) {
  llama_chat_message chat[] = {{"user", "test"}};
  int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
  return res >= 0;
  }

- std::string llama_chat_apply_template(const struct llama_model * model,
+ std::string common_chat_apply_template(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & msgs,
+ const std::vector<common_chat_msg> & msgs,
  bool add_ass) {
  int alloc_size = 0;
  bool fallback = false; // indicate if we must fallback to default chatml
@@ -1595,42 +1662,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
  return formatted_chat;
  }

- std::string llama_chat_format_single(const struct llama_model * model,
+ std::string common_chat_format_single(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & past_msg,
- const llama_chat_msg & new_msg,
+ const std::vector<common_chat_msg> & past_msg,
+ const common_chat_msg & new_msg,
  bool add_ass) {
  std::ostringstream ss;
- auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
- std::vector<llama_chat_msg> chat_new(past_msg);
+ auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+ std::vector<common_chat_msg> chat_new(past_msg);
  // if the past_msg ends with a newline, we must preserve it in the formatted version
  if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
  ss << "\n";
  };
  // format chat with new_msg
  chat_new.push_back(new_msg);
- auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
+ auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
  // get the diff part
  ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
  return ss.str();
  }

- std::string llama_chat_format_example(const struct llama_model * model,
+ std::string common_chat_format_example(const struct llama_model * model,
  const std::string & tmpl) {
- std::vector<llama_chat_msg> msgs = {
+ std::vector<common_chat_msg> msgs = {
  {"system", "You are a helpful assistant"},
  {"user", "Hello"},
  {"assistant", "Hi there"},
  {"user", "How are you?"},
  };
- return llama_chat_apply_template(model, tmpl, msgs, true);
+ return common_chat_apply_template(model, tmpl, msgs, true);
  }

  //
  // KV cache utils
  //

- void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
  static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";

  printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@@ -1653,7 +1720,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
  printf("\n=== Done dumping\n");
  }

- void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
  static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

  printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -1705,7 +1772,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
  // Embedding utils
  //

- void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
  double sum = 0.0;

  switch (embd_norm) {
@@ -1714,7 +1781,9 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
  break;
  case 0: // max absolute
  for (int i = 0; i < n; i++) {
- if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+ if (sum < std::abs(inp[i])) {
+ sum = std::abs(inp[i]);
+ }
  }
  sum /= 32760.0; // make an int16 range
  break;
@@ -1739,7 +1808,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
  }
  }

- float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
  double sum = 0.0;
  double sum1 = 0.0;
  double sum2 = 0.0;
@@ -1765,8 +1834,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
  // Control vector utils
  //

- static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
- llama_control_vector_data result = { -1, {} };
+ static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
+ common_control_vector_data result = { -1, {} };

  ggml_context * ctx = nullptr;
  struct gguf_init_params meta_gguf_params = {
@@ -1850,11 +1919,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
  return result;
  }

- llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
- llama_control_vector_data result = { -1, {} };
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
+ common_control_vector_data result = { -1, {} };

  for (const auto & info : load_infos) {
- auto cur = llama_control_vector_load_one(info);
+ auto cur = common_control_vector_load_one(info);

  if (cur.n_embd == -1) {
  result.n_embd = -1;
@@ -1884,211 +1953,3 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
  return result;
  }

- //
- // YAML utils
- //
-
- void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
- if (data.empty()) {
- fprintf(stream, "%s:\n", prop_name);
- return;
- }
-
- fprintf(stream, "%s: [", prop_name);
- for (size_t i = 0; i < data.size() - 1; ++i) {
- fprintf(stream, "%e, ", data[i]);
- }
- fprintf(stream, "%e]\n", data.back());
- }
-
- void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
- if (data.empty()) {
- fprintf(stream, "%s:\n", prop_name);
- return;
- }
-
- fprintf(stream, "%s: [", prop_name);
- for (size_t i = 0; i < data.size() - 1; ++i) {
- fprintf(stream, "%d, ", data[i]);
- }
- fprintf(stream, "%d]\n", data.back());
- }
-
- void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
- std::string data_str(data == NULL ? "" : data);
-
- if (data_str.empty()) {
- fprintf(stream, "%s:\n", prop_name);
- return;
- }
-
- size_t pos_start = 0;
- size_t pos_found = 0;
-
- if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
- data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
- data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
- data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
- data_str = "\"" + data_str + "\"";
- fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
- return;
- }
-
- if (data_str.find('\n') == std::string::npos) {
- fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
- return;
- }
-
- fprintf(stream, "%s: |\n", prop_name);
- while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
- fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
- pos_start = pos_found + 1;
- }
- }
-
- void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
- const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
- const auto & sparams = params.sparams;
-
- fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
- fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
- fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
- fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
- fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
- fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
- fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
- fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
- fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
- fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
- fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
- fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
- fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
- fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
- fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
- fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
- fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
- fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
- fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
- fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
- fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
- fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
- fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
- fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
-
- #ifdef NDEBUG
- fprintf(stream, "debug: false\n");
- #else
- fprintf(stream, "debug: true\n");
- #endif // NDEBUG
-
- fprintf(stream, "model_desc: %s\n", model_desc);
- fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
-
- #ifdef __OPTIMIZE__
- fprintf(stream, "optimize: true\n");
- #else
- fprintf(stream, "optimize: false\n");
- #endif // __OPTIMIZE__
-
- fprintf(stream, "time: %s\n", timestamp.c_str());
-
- fprintf(stream, "\n");
- fprintf(stream, "###############\n");
- fprintf(stream, "# User Inputs #\n");
- fprintf(stream, "###############\n");
- fprintf(stream, "\n");
-
- fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
- fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
- fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
- fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
- fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
- fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
- fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
- fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
- yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
- fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
- fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
- fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
- fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
-
- yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
- fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
- yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
- fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
- fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
- fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
- fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
-
- fprintf(stream, "logit_bias:\n");
- for (const auto & logit_bias : sparams.logit_bias) {
- fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
- }
-
- fprintf(stream, "lora:\n");
- for (auto & la : params.lora_adapters) {
- if (la.scale == 1.0f) {
- fprintf(stream, " - %s\n", la.path.c_str());
- }
- }
- fprintf(stream, "lora_scaled:\n");
- for (auto & la : params.lora_adapters) {
- if (la.scale != 1.0f) {
- fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
- }
- }
- fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
- fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
- fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
- fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
- fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
- fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
- fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
- fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
- fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
- fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
- fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
- fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
- fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
- fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
- fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
- fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
- fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
- fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
- yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
- fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
- fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
- fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
- yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
- fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
-
- fprintf(stream, "reverse_prompt:\n");
- for (std::string ap : params.antiprompt) {
- size_t pos = 0;
- while ((pos = ap.find('\n', pos)) != std::string::npos) {
- ap.replace(pos, 1, "\\n");
- pos += 1;
- }
-
- fprintf(stream, " - %s\n", ap.c_str());
- }
-
- fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
- fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
- fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
- fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
- fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
- fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
-
- const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
- yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
-
- fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
- fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
- fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
- fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
- fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
- fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
- fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
- fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
- }
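Taken together, the common.cpp hunks rename the helper layer from gpt_*/llama_* to common_* (gpt_init → common_init, llama_init_from_gpt_params → common_init_from_params, llama_tokenize → common_tokenize, and so on) and drop the YAML dump utilities. A rough before/after sketch for downstream callers, illustrative only and not taken from this package's source; it assumes the vendored common.h/llama.h headers and the model/context members of common_init_result:

    // before (vendored llama.cpp in 0.3.2), shown as comments:
    //   gpt_params params;
    //   gpt_init();
    //   llama_init_result init = llama_init_from_gpt_params(params);
    //   std::vector<llama_token> tokens = llama_tokenize(init.context, "Hello", true, true);

    // after (vendored llama.cpp in 0.3.4):
    #include "common.h"
    #include "llama.h"

    int main() {
        common_params params;                // was: gpt_params
        params.model = "model.gguf";         // hypothetical local model path
        common_init();                       // was: gpt_init()

        common_init_result init = common_init_from_params(params);  // was: llama_init_from_gpt_params()
        if (init.model == nullptr || init.context == nullptr) {
            return 1;
        }

        // was: llama_tokenize() from common.h
        std::vector<llama_token> tokens = common_tokenize(init.context, "Hello", /*add_special=*/true, /*parse_special=*/true);

        llama_free(init.context);
        llama_free_model(init.model);
        return 0;
    }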