@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -4,18 +4,9 @@
 
 #include "llama.h"
 
-#include "sampling.h"
-
-#define LOG_NO_FILE_LINE_FUNCTION
-#include "log.h"
-
-#include <cmath>
 #include <string>
 #include <vector>
-#include <random>
-#include <thread>
-#include <unordered_map>
-#include <tuple>
+#include <sstream>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -33,40 +24,138 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct common_lora_adapter_info {
+    std::string path;
+    float scale;
+};
+
+struct common_lora_adapter_container : common_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
 extern char const * LLAMA_COMPILER;
 extern char const * LLAMA_BUILD_TARGET;
 
-struct llama_control_vector_load_info;
+struct common_control_vector_load_info;
 
 //
 // CPU utils
 //
 
+struct cpu_params {
+    int n_threads = -1;
+    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool mask_valid = false; // Default: any CPU
+    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool strict_cpu = false; // Use strict CPU placement
+    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();
 
 //
-// CLI argument parsing
+// Common params
 //
 
+enum llama_example {
+    LLAMA_EXAMPLE_COMMON,
+    LLAMA_EXAMPLE_SPECULATIVE,
+    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_INFILL,
+    LLAMA_EXAMPLE_EMBEDDING,
+    LLAMA_EXAMPLE_PERPLEXITY,
+    LLAMA_EXAMPLE_RETRIEVAL,
+    LLAMA_EXAMPLE_PASSKEY,
+    LLAMA_EXAMPLE_IMATRIX,
+    LLAMA_EXAMPLE_BENCH,
+    LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
+    LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_LOOKUP,
+    LLAMA_EXAMPLE_PARALLEL,
+
+    LLAMA_EXAMPLE_COUNT,
+};
+
+enum common_sampler_type {
+    COMMON_SAMPLER_TYPE_NONE = 0,
+    COMMON_SAMPLER_TYPE_DRY = 1,
+    COMMON_SAMPLER_TYPE_TOP_K = 2,
+    COMMON_SAMPLER_TYPE_TOP_P = 3,
+    COMMON_SAMPLER_TYPE_MIN_P = 4,
+    //COMMON_SAMPLER_TYPE_TFS_Z = 5,
+    COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
+    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
+    COMMON_SAMPLER_TYPE_XTC = 8,
+    COMMON_SAMPLER_TYPE_INFILL = 9,
+};
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
     DIMRE_METHOD_PCA,
     DIMRE_METHOD_MEAN,
 };
 
-struct gpt_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
+// sampler parameters
+struct common_sampler_params {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
+
+    int32_t n_prev = 64; // number of previous tokens to remember
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k = 40; // <= 0 to use vocab size
+    float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
+    float xtc_probability = 0.00f; // 0.0 = disabled
+    float xtc_threshold = 0.10f; // > 0.5 disables XTC
+    float typ_p = 1.00f; // typical_p, 1.0 = disabled
+    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float dynatemp_range = 0.00f; // 0.0 = disabled
+    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float penalty_repeat = 1.00f; // 1.0 = disabled
+    float penalty_freq = 0.00f; // 0.0 = disabled
+    float penalty_present = 0.00f; // 0.0 = disabled
+    float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
+    float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
+    int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
+    int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f; // target entropy
+    float mirostat_eta = 0.10f; // learning rate
+    bool penalize_nl = false; // consider newlines as a repeatable token
+    bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics
+
+    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
+
+
+    std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_K,
+        COMMON_SAMPLER_TYPE_TYPICAL_P,
+        COMMON_SAMPLER_TYPE_TOP_P,
+        COMMON_SAMPLER_TYPE_MIN_P,
+        COMMON_SAMPLER_TYPE_XTC,
+        COMMON_SAMPLER_TYPE_TEMPERATURE,
+    };
+
+    std::string grammar; // optional BNF-like grammar to constrain sampling
+
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+    // print the parameters into a string
+    std::string print() const;
+};
 
-    int32_t n_threads = cpu_get_num_math();
-    int32_t n_threads_draft = -1;
-    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
+struct common_params {
     int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 0; // context size
+    int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -89,7 +178,12 @@ struct gpt_params {
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
-    float defrag_thold = -1.0f; // KV cache defragmentation threshold
+    float defrag_thold = 0.1f; // KV cache defragmentation threshold
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
@@ -101,35 +195,33 @@ struct gpt_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    // // sampling parameters
-    struct llama_sampling_params sparams;
-
-    std::string model = ""; // model path
-    std::string model_draft = ""; // draft model for speculative decoding
-    std::string model_alias = "unknown"; // model alias
-    std::string model_url = ""; // model url to download
-    std::string hf_token = ""; // HF token
-    std::string hf_repo = ""; // HF repo
-    std::string hf_file = ""; // HF file
-    std::string prompt = "";
-    std::string prompt_file = ""; // store the external prompt file name
-    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
-    std::string input_prefix = ""; // string to prefix user inputs with
-    std::string input_suffix = ""; // string to suffix user inputs with
-    std::string logdir = ""; // directory in which to save YAML log files
-    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
-    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file = ""; // file for saving *all* logits
-    std::string rpc_servers = ""; // comma separated list of RPC servers
+    struct common_sampler_params sparams;
+
+    std::string model = ""; // model path // NOLINT
+    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
+    std::string model_alias = "unknown"; // model alias // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
+    std::string hf_token = ""; // HF token // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+    std::string prompt = ""; // NOLINT
+    std::string prompt_file = ""; // store the external prompt file name // NOLINT
+    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
+    std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
+    std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
+    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
+    std::string logits_file = ""; // file for saving *all* logits // NOLINT
+    std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    // TODO: avoid tuple, use struct
-    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
-    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
@@ -164,15 +256,15 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
+    bool no_perf = false; // disable performance metrics
+    bool ctx_shift = true; // context shift on inifinite text generation
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool ignore_eos = false; // ignore generated EOS tokens
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
@@ -182,33 +274,37 @@ struct gpt_params {
     std::string cache_type_v = "f16"; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
+    std::string mmproj = ""; // path to multimodal projector // NOLINT
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
     bool embedding = false; // get only sentence embedding
-    int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-    std::string embd_sep = "\n"; // separator of embendings
+    std::string embd_sep = "\n"; // separator of embeddings
+    bool reranking = false; // enable reranking support on server
 
     // server params
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
     int32_t timeout_write = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1; // number of threads to process HTTP requests
+    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
 
     std::string hostname = "127.0.0.1";
-    std::string public_path = "";
-    std::string chat_template = "";
-    std::string system_prompt = "";
+    std::string public_path = ""; // NOLINT
+    std::string chat_template = ""; // NOLINT
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
 
-    std::string ssl_file_key = "";
-    std::string ssl_file_cert = "";
+    std::string ssl_file_key = ""; // NOLINT
+    std::string ssl_file_cert = ""; // NOLINT
 
-    bool endpoint_slots = true;
+    // "advanced" endpoints are disabled by default for better security
+    bool webui = true;
+    bool endpoint_slots = false;
+    bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
     bool log_json = false;
@@ -256,29 +352,47 @@ struct gpt_params {
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 
     std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
+    // batched-bench params
+    bool batched_bench_output_jsonl = false;
 };
 
-void gpt_params_handle_hf_token(gpt_params & params);
-void gpt_params_handle_model_default(gpt_params & params);
+// call once at the start of a program if it uses libcommon
+// initializes the logging system and prints info about the build
+void common_init();
 
-bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
-bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
-bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
+std::string common_params_get_system_info(const common_params & params);
 
-std::string gpt_params_get_system_info(const gpt_params & params);
+bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
+bool set_process_priority(enum ggml_sched_priority prio);
 
 //
 // String utils
 //
 
-std::vector<std::string> string_split(std::string input, char separator);
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+std::string string_format(const char * fmt, ...);
 
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
+    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
     std::vector<T> values;
     std::istringstream str_stream(str);
     std::string token;
@@ -291,9 +405,30 @@ static std::vector<T> string_split(const std::string & str, char delim) {
     return values;
 }
 
+template<>
+std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+{
+    std::vector<std::string> parts;
+    size_t begin_pos = 0;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+        parts.emplace_back(part);
+        begin_pos = separator_pos + 1;
+        separator_pos = input.find(separator, begin_pos);
+    }
+    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+    return parts;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
+std::string string_from(bool value);
+std::string string_from(const std::vector<int> & values);
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
 //
 // Filesystem utils
 //
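The hunk above splits string splitting in two: the generic string_split<T> template gains a static_assert that rejects T = std::string, and an explicit std::string specialization is added that walks the separators with find/substr, so empty fields are kept and nothing is stream-parsed. A rough usage sketch follows; it assumes common.h is included, that the elided middle of the generic template converts each delimited field via stream extraction (only the lines shown above are part of this diff), and the input values are made up for illustration:

    #include "common.h"

    int main() {
        // numeric fields still go through the generic template
        std::vector<int> layers = string_split<int>("1,2,3", ',');               // {1, 2, 3}

        // strings must use the explicit specialization shown above; the new
        // static_assert stops string_split<std::string> from taking the generic path
        std::vector<std::string> parts = string_split<std::string>("a,,b", ','); // {"a", "", "b"}

        return (layers.size() == 3 && parts.size() == 3) ? 0 : 1;
    }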
@@ -308,20 +443,29 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-// TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct common_init_result {
+    struct llama_model * model = nullptr;
+    struct llama_context * context = nullptr;
+    std::vector<common_lora_adapter_container> lora_adapters;
+};
+
+struct common_init_result common_init_from_params(common_params & params);
+
+struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_context_params common_context_params_to_llama(const common_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+// clear LoRA adapters from context, then apply new list of adapters
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
 
 // Batch utils
 
-void llama_batch_clear(struct llama_batch & batch);
+void common_batch_clear(struct llama_batch & batch);
 
-void llama_batch_add(
+void common_batch_add(
         struct llama_batch & batch,
         llama_token id,
         llama_pos pos,
@@ -334,13 +478,13 @@ void llama_batch_add(
 
 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool add_special,
         bool parse_special = false);
 
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
@@ -348,7 +492,7 @@ std::vector<llama_token> llama_tokenize(
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
+std::string common_token_to_piece(
         const struct llama_context * ctx,
         llama_token token,
         bool special = true);
@@ -356,45 +500,41 @@ std::string llama_token_to_piece(
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
-std::string llama_detokenize(
+std::string common_detokenize(
         llama_context * ctx,
         const std::vector<llama_token> & tokens,
         bool special = true);
 
-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // Chat template utils
 //
 
 // same with llama_chat_message, but uses std::string
-struct llama_chat_msg {
+struct common_chat_msg {
     std::string role;
     std::string content;
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool llama_chat_verify_template(const std::string & tmpl);
+bool common_chat_verify_template(const std::string & tmpl);
 
 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
-std::string llama_chat_apply_template(const struct llama_model * model,
+std::string common_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & chat,
+        const std::vector<common_chat_msg> & chat,
         bool add_ass);
 
 // Format single message, while taking into account the position of that message in chat history
-std::string llama_chat_format_single(const struct llama_model * model,
+std::string common_chat_format_single(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & past_msg,
-        const llama_chat_msg & new_msg,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
         bool add_ass);
 
 // Returns an example of formatted chat
-std::string llama_chat_format_example(const struct llama_model * model,
+std::string common_chat_format_example(const struct llama_model * model,
         const std::string & tmpl);
 
 //
@@ -402,31 +542,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
 //
 
 // Dump the KV cache view with the number of sequences per cell.
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
 
 //
 // Embedding utils
 //
 
-void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
 
-float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
 //
 // Control vector utils
 //
 
-struct llama_control_vector_data {
+struct common_control_vector_data {
     int n_embd;
 
     // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
     std::vector<float> data;
 };
 
-struct llama_control_vector_load_info {
+struct common_control_vector_load_info {
     float strength;
 
     std::string fname;
@@ -434,7 +574,7 @@ struct llama_control_vector_load_info {
 
 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
-llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
 
 //
 // Split utils
@@ -443,15 +583,3 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 static const char * const LLM_KV_SPLIT_NO = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-        FILE * stream, const gpt_params & params, const llama_context * lctx,
-        const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
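The header diff above belongs to package/src/llama.cpp/common/common.h (entry 38 in the file list). Its net effect is a rename of the old gpt_params/llama_* helper layer to a common_* prefix, plus a new common_init_result bundle that replaces the model/context tuple previously returned by llama_init_from_gpt_params. Below is a minimal caller sketch against the new header, using only names and signatures visible above; the model path and prompt are placeholders and cleanup of the returned objects is omitted:

    #include "common.h"
    #include "llama.h"

    int main() {
        common_params params;
        params.model  = "models/7B/ggml-model-f16.gguf"; // placeholder (matches DEFAULT_MODEL_PATH above)
        params.prompt = "Hello";
        params.sparams.temp  = 0.7f; // sampler settings now live in common_sampler_params
        params.sparams.top_k = 40;

        common_init(); // one-time libcommon setup (logging, build info)

        // was: std::tie(model, ctx) = llama_init_from_gpt_params(params);
        common_init_result init = common_init_from_params(params);
        llama_model   * model = init.model;
        llama_context * ctx   = init.context;

        // was: llama_tokenize(...) / llama_token_to_piece(...)
        std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, /*add_special=*/true);
        std::string piece = tokens.empty() ? "" : common_token_to_piece(ctx, tokens[0]);

        (void) model; (void) piece;
        return 0;
    }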
@@ -94,6 +94,9 @@ namespace console {
                 simple_io = true;
             }
         }
+        if (simple_io) {
+            _setmode(_fileno(stdin), _O_U8TEXT);
+        }
 #else
         // POSIX-specific console initialization
         if (!simple_io) {
@@ -611,7 +611,7 @@ private:
             }
             return join_seq();
         };
-        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
+        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
     }
 
     /*