cui-llama.rn 1.4.4 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (216) hide show
  1. package/android/src/main/CMakeLists.txt +9 -2
  2. package/android/src/main/jni.cpp +54 -34
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/binary-ops.cpp +158 -0
  12. package/cpp/binary-ops.h +16 -0
  13. package/cpp/chat.cpp +1769 -1085
  14. package/cpp/chat.h +143 -0
  15. package/cpp/common.cpp +1562 -1996
  16. package/cpp/common.h +677 -744
  17. package/cpp/cpu-common.h +72 -0
  18. package/cpp/ggml-alloc.c +1039 -1030
  19. package/cpp/ggml-alloc.h +1 -1
  20. package/cpp/ggml-backend-impl.h +255 -255
  21. package/cpp/ggml-backend-reg.cpp +586 -582
  22. package/cpp/ggml-backend.cpp +2004 -2002
  23. package/cpp/ggml-backend.h +354 -354
  24. package/cpp/ggml-common.h +1857 -1851
  25. package/cpp/ggml-cpp.h +39 -39
  26. package/cpp/ggml-cpu-aarch64.cpp +5725 -4247
  27. package/cpp/ggml-cpu-aarch64.h +8 -8
  28. package/cpp/ggml-cpu-impl.h +512 -380
  29. package/cpp/ggml-cpu-quants.c +13026 -11517
  30. package/cpp/ggml-cpu-traits.cpp +36 -36
  31. package/cpp/ggml-cpu-traits.h +38 -38
  32. package/cpp/ggml-cpu.c +3438 -14485
  33. package/cpp/ggml-cpu.cpp +655 -633
  34. package/cpp/ggml-cpu.h +138 -135
  35. package/cpp/ggml-impl.h +594 -567
  36. package/cpp/ggml-metal-impl.h +312 -3
  37. package/cpp/ggml-metal.h +66 -66
  38. package/cpp/ggml-metal.m +5360 -5002
  39. package/cpp/ggml-opt.cpp +854 -854
  40. package/cpp/ggml-opt.h +216 -216
  41. package/cpp/ggml-quants.c +5238 -5238
  42. package/cpp/ggml-threading.h +14 -14
  43. package/cpp/ggml.c +6618 -6524
  44. package/cpp/ggml.h +2222 -2194
  45. package/cpp/gguf.cpp +1330 -1329
  46. package/cpp/gguf.h +202 -202
  47. package/cpp/json-schema-to-grammar.cpp +1024 -1025
  48. package/cpp/json-schema-to-grammar.h +21 -22
  49. package/cpp/json.hpp +24766 -24766
  50. package/cpp/llama-adapter.cpp +382 -347
  51. package/cpp/llama-adapter.h +76 -74
  52. package/cpp/llama-arch.cpp +1714 -1492
  53. package/cpp/llama-arch.h +428 -402
  54. package/cpp/llama-batch.cpp +368 -368
  55. package/cpp/llama-batch.h +88 -88
  56. package/cpp/llama-chat.cpp +640 -587
  57. package/cpp/llama-chat.h +56 -53
  58. package/cpp/llama-context.cpp +2831 -1775
  59. package/cpp/llama-context.h +265 -128
  60. package/cpp/llama-cparams.cpp +1 -1
  61. package/cpp/llama-cparams.h +38 -37
  62. package/cpp/llama-cpp.h +30 -30
  63. package/cpp/llama-grammar.cpp +1219 -1219
  64. package/cpp/llama-grammar.h +173 -164
  65. package/cpp/llama-graph.cpp +1695 -0
  66. package/cpp/llama-graph.h +592 -0
  67. package/cpp/llama-hparams.cpp +79 -71
  68. package/cpp/llama-hparams.h +156 -139
  69. package/cpp/llama-impl.cpp +167 -167
  70. package/cpp/llama-impl.h +61 -61
  71. package/cpp/llama-io.cpp +15 -0
  72. package/cpp/llama-io.h +35 -0
  73. package/cpp/llama-kv-cache.cpp +1380 -718
  74. package/cpp/llama-kv-cache.h +213 -218
  75. package/cpp/llama-memory.cpp +1 -0
  76. package/cpp/llama-memory.h +21 -0
  77. package/cpp/llama-mmap.cpp +600 -590
  78. package/cpp/llama-mmap.h +68 -68
  79. package/cpp/llama-model-loader.cpp +1129 -1124
  80. package/cpp/llama-model-loader.h +169 -167
  81. package/cpp/llama-model.cpp +13080 -4023
  82. package/cpp/llama-model.h +409 -370
  83. package/cpp/llama-sampling.cpp +2563 -2525
  84. package/cpp/llama-sampling.h +32 -32
  85. package/cpp/llama-vocab.cpp +3295 -3252
  86. package/cpp/llama-vocab.h +125 -125
  87. package/cpp/llama.cpp +351 -10137
  88. package/cpp/llama.h +1434 -1340
  89. package/cpp/log.cpp +427 -423
  90. package/cpp/log.h +132 -132
  91. package/cpp/{chat-template.hpp → minja/chat-template.hpp} +537 -529
  92. package/cpp/{minja.hpp → minja/minja.hpp} +2941 -2883
  93. package/cpp/ops.cpp +8723 -0
  94. package/cpp/ops.h +128 -0
  95. package/cpp/rn-llama.cpp +45 -71
  96. package/cpp/rn-llama.h +3 -3
  97. package/cpp/sampling.cpp +573 -532
  98. package/cpp/sgemm.cpp +3043 -2598
  99. package/cpp/sgemm.h +14 -14
  100. package/cpp/simd-mappings.h +888 -0
  101. package/cpp/speculative.cpp +278 -277
  102. package/cpp/speculative.h +28 -28
  103. package/cpp/unary-ops.cpp +186 -0
  104. package/cpp/unary-ops.h +28 -0
  105. package/cpp/vec.cpp +258 -0
  106. package/cpp/vec.h +802 -0
  107. package/ios/CMakeLists.txt +5 -2
  108. package/ios/RNLlama.mm +2 -2
  109. package/ios/RNLlamaContext.mm +40 -24
  110. package/package.json +1 -1
  111. package/src/NativeRNLlama.ts +6 -4
  112. package/src/index.ts +3 -1
  113. package/android/src/main/build-arm64/CMakeCache.txt +0 -429
  114. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  115. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +0 -101
  116. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  117. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  118. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  119. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  120. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  121. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  122. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  123. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -431
  124. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +0 -16
  125. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +0 -165
  126. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +0 -297
  127. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +0 -1
  128. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +0 -1
  129. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +0 -1
  130. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +0 -1
  131. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +0 -1
  132. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +0 -1
  133. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +0 -1
  134. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +0 -8
  135. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +0 -1
  136. package/android/src/main/build-arm64/CMakeFiles/progress.marks +0 -1
  137. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  138. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +0 -58
  139. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  140. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +0 -756
  141. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  142. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +0 -709
  143. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  144. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +0 -714
  145. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  146. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +0 -62
  147. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  148. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +0 -708
  149. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  150. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +0 -113
  151. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  152. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +0 -713
  153. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  154. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +0 -763
  155. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  156. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +0 -61
  157. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  158. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +0 -707
  159. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  160. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +0 -104
  161. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  162. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +0 -714
  163. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  164. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +0 -723
  165. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +0 -62
  166. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +0 -722
  167. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +0 -89
  168. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +0 -2
  169. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +0 -2
  170. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +0 -2
  171. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +0 -17
  172. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +0 -41
  173. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +0 -62
  174. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +0 -722
  175. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +0 -89
  176. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +0 -2
  177. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +0 -2
  178. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +0 -2
  179. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +0 -17
  180. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +0 -41
  181. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +0 -62
  182. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +0 -722
  183. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +0 -89
  184. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +0 -2
  185. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +0 -2
  186. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +0 -2
  187. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +0 -17
  188. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +0 -41
  189. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +0 -62
  190. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +0 -722
  191. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +0 -89
  192. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +0 -2
  193. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +0 -2
  194. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +0 -2
  195. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +0 -17
  196. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +0 -41
  197. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +0 -62
  198. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +0 -722
  199. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +0 -89
  200. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +0 -2
  201. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +0 -2
  202. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +0 -2
  203. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +0 -17
  204. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +0 -41
  205. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +0 -62
  206. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +0 -722
  207. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +0 -89
  208. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +0 -2
  209. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +0 -2
  210. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +0 -2
  211. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +0 -17
  212. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +0 -41
  213. package/android/src/main/build-arm64/Makefile +0 -1862
  214. package/android/src/main/build-arm64/cmake_install.cmake +0 -66
  215. package/cpp/chat.hpp +0 -55
  216. package/cpp/rn-llama.hpp +0 -913
package/cpp/common.h CHANGED
@@ -1,744 +1,677 @@
1
- // Various helper functions and utilities
2
-
3
- #pragma once
4
-
5
- #include "llama-cpp.h"
6
-
7
- #include <set>
8
- #include <string>
9
- #include <vector>
10
- #include <sstream>
11
-
12
- #ifdef _WIN32
13
- #define DIRECTORY_SEPARATOR '\\'
14
- #else
15
- #define DIRECTORY_SEPARATOR '/'
16
- #endif // _WIN32
17
-
18
- #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
19
- #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
20
-
21
- #define print_build_info() do { \
22
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
23
- fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
24
- } while(0)
25
-
26
- #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
27
-
28
- struct common_adapter_lora_info {
29
- std::string path;
30
- float scale;
31
-
32
- struct llama_adapter_lora * ptr;
33
- };
34
-
35
- using llama_tokens = std::vector<llama_token>;
36
-
37
- // build info
38
- extern int LLAMA_BUILD_NUMBER;
39
- extern const char * LLAMA_COMMIT;
40
- extern const char * LLAMA_COMPILER;
41
- extern const char * LLAMA_BUILD_TARGET;
42
-
43
- struct common_control_vector_load_info;
44
-
45
- #define print_build_info() do { \
46
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
47
- fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
48
- } while(0)
49
-
50
- // build info
51
- extern int LLAMA_BUILD_NUMBER;
52
- extern char const *LLAMA_COMMIT;
53
- extern char const *LLAMA_COMPILER;
54
- extern char const *LLAMA_BUILD_TARGET;
55
-
56
- //
57
- // CPU utils
58
- //
59
-
60
- struct cpu_params {
61
- int n_threads = -1;
62
- bool cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
63
- bool mask_valid = false; // Default: any CPU
64
- enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
65
- bool strict_cpu = false; // Use strict CPU placement
66
- uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
67
- };
68
-
69
- int32_t cpu_get_num_physical_cores();
70
- int32_t cpu_get_num_math();
71
-
72
- //
73
- // Common params
74
- //
75
-
76
- enum llama_example {
77
- LLAMA_EXAMPLE_COMMON,
78
- LLAMA_EXAMPLE_SPECULATIVE,
79
- LLAMA_EXAMPLE_MAIN,
80
- LLAMA_EXAMPLE_INFILL,
81
- LLAMA_EXAMPLE_EMBEDDING,
82
- LLAMA_EXAMPLE_PERPLEXITY,
83
- LLAMA_EXAMPLE_RETRIEVAL,
84
- LLAMA_EXAMPLE_PASSKEY,
85
- LLAMA_EXAMPLE_IMATRIX,
86
- LLAMA_EXAMPLE_BENCH,
87
- LLAMA_EXAMPLE_SERVER,
88
- LLAMA_EXAMPLE_CVECTOR_GENERATOR,
89
- LLAMA_EXAMPLE_EXPORT_LORA,
90
- LLAMA_EXAMPLE_LLAVA,
91
- LLAMA_EXAMPLE_LOOKUP,
92
- LLAMA_EXAMPLE_PARALLEL,
93
- LLAMA_EXAMPLE_TTS,
94
-
95
- LLAMA_EXAMPLE_COUNT,
96
- };
97
-
98
- enum common_sampler_type {
99
- COMMON_SAMPLER_TYPE_NONE = 0,
100
- COMMON_SAMPLER_TYPE_DRY = 1,
101
- COMMON_SAMPLER_TYPE_TOP_K = 2,
102
- COMMON_SAMPLER_TYPE_TOP_P = 3,
103
- COMMON_SAMPLER_TYPE_MIN_P = 4,
104
- //COMMON_SAMPLER_TYPE_TFS_Z = 5,
105
- COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
106
- COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
107
- COMMON_SAMPLER_TYPE_XTC = 8,
108
- COMMON_SAMPLER_TYPE_INFILL = 9,
109
- COMMON_SAMPLER_TYPE_PENALTIES = 10,
110
- };
111
-
112
- // dimensionality reduction methods, used by cvector-generator
113
- enum dimre_method {
114
- DIMRE_METHOD_PCA,
115
- DIMRE_METHOD_MEAN,
116
- };
117
-
118
- enum common_conversation_mode {
119
- COMMON_CONVERSATION_MODE_DISABLED = 0,
120
- COMMON_CONVERSATION_MODE_ENABLED = 1,
121
- COMMON_CONVERSATION_MODE_AUTO = 2,
122
- };
123
-
124
- struct common_grammar_trigger {
125
- std::string word;
126
- bool at_start;
127
- };
128
-
129
- // sampling parameters
130
- struct common_params_sampling {
131
- uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
132
-
133
- int32_t n_prev = 64; // number of previous tokens to remember
134
- int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
135
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
136
- int32_t top_k = 40; // <= 0 to use vocab size
137
- float top_p = 0.95f; // 1.0 = disabled
138
- float min_p = 0.05f; // 0.0 = disabled
139
- float xtc_probability = 0.00f; // 0.0 = disabled
140
- float xtc_threshold = 0.10f; // > 0.5 disables XTC
141
- float typ_p = 1.00f; // typical_p, 1.0 = disabled
142
- float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
143
- float dynatemp_range = 0.00f; // 0.0 = disabled
144
- float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
145
- int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
146
- float penalty_repeat = 1.00f; // 1.0 = disabled
147
- float penalty_freq = 0.00f; // 0.0 = disabled
148
- float penalty_present = 0.00f; // 0.0 = disabled
149
- float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
150
- float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
151
- int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
152
- int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
153
- int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
154
- float top_n_sigma = -1.00f;// -1.0 = disabled
155
- float mirostat_tau = 5.00f; // target entropy
156
- float mirostat_eta = 0.10f; // learning rate
157
- bool ignore_eos = false;
158
- bool no_perf = false; // disable performance metrics
159
- bool timing_per_token = false;
160
-
161
- std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
162
-
163
-
164
- std::vector<enum common_sampler_type> samplers = {
165
- COMMON_SAMPLER_TYPE_PENALTIES,
166
- COMMON_SAMPLER_TYPE_DRY,
167
- COMMON_SAMPLER_TYPE_TOP_K,
168
- COMMON_SAMPLER_TYPE_TYPICAL_P,
169
- COMMON_SAMPLER_TYPE_TOP_P,
170
- COMMON_SAMPLER_TYPE_MIN_P,
171
- COMMON_SAMPLER_TYPE_XTC,
172
- COMMON_SAMPLER_TYPE_TEMPERATURE,
173
- };
174
-
175
- std::string grammar; // optional BNF-like grammar to constrain sampling
176
- bool grammar_lazy = false;
177
- std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
178
- std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
179
- std::set<llama_token> preserved_tokens;
180
-
181
- std::vector<llama_logit_bias> logit_bias; // logit biases to apply
182
-
183
- // print the parameters into a string
184
- std::string print() const;
185
- };
186
-
187
- struct common_params_speculative {
188
- std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
189
-
190
- int32_t n_ctx = 0; // draft context size
191
- int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
192
- int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
193
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
194
- float p_split = 0.1f; // speculative decoding split probability
195
- float p_min = 0.9f; // minimum speculative decoding probability (greedy)
196
-
197
- struct cpu_params cpuparams;
198
- struct cpu_params cpuparams_batch;
199
-
200
- std::string hf_repo = ""; // HF repo // NOLINT
201
- std::string hf_file = ""; // HF file // NOLINT
202
-
203
- std::string model = ""; // draft model for speculative decoding // NOLINT
204
- std::string model_url = ""; // model url to download // NOLINT
205
- };
206
-
207
- struct common_params_vocoder {
208
- std::string hf_repo = ""; // HF repo // NOLINT
209
- std::string hf_file = ""; // HF file // NOLINT
210
-
211
- std::string model = ""; // model path // NOLINT
212
- std::string model_url = ""; // model url to download // NOLINT
213
-
214
- bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
215
- };
216
-
217
- enum common_reasoning_format {
218
- COMMON_REASONING_FORMAT_NONE,
219
- COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
220
- };
221
-
222
- struct common_params {
223
-
224
- void * progress_callback_user_data = nullptr;
225
- llama_progress_callback progress_callback = nullptr;
226
- bool vocab_only = false;
227
- int32_t n_predict = -1; // new tokens to predict
228
- int32_t n_ctx = 4096; // context size
229
- int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
230
- int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
231
- int32_t n_keep = 0; // number of tokens to keep from initial prompt
232
- int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
233
- int32_t n_parallel = 1; // number of parallel sequences to decode
234
- int32_t n_sequences = 1; // number of sequences to decode
235
- int32_t grp_attn_n = 1; // group-attention factor
236
- int32_t grp_attn_w = 512; // group-attention width
237
- int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
238
- float rope_freq_base = 0.0f; // RoPE base frequency
239
- float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
240
- float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
241
- float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
242
- float yarn_beta_fast = 32.0f; // YaRN low correction dim
243
- float yarn_beta_slow = 1.0f; // YaRN high correction dim
244
- int32_t yarn_orig_ctx = 0; // YaRN original context length
245
- float defrag_thold = 0.1f; // KV cache defragmentation threshold
246
-
247
- // offload params
248
- std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
249
-
250
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
251
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
252
- float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
253
-
254
- enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
255
-
256
- struct cpu_params cpuparams;
257
- struct cpu_params cpuparams_batch;
258
-
259
- lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
260
- void * cb_eval_user_data = nullptr;
261
-
262
- lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED;
263
-
264
- enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
265
- enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
266
- enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
267
-
268
- struct common_params_sampling sampling;
269
- struct common_params_speculative speculative;
270
- struct common_params_vocoder vocoder;
271
-
272
- std::string model = ""; // model path // NOLINT
273
- std::string model_alias = ""; // model alias // NOLINT
274
- std::string model_url = ""; // model url to download // NOLINT
275
- std::string hf_token = ""; // HF token // NOLINT
276
- std::string hf_repo = ""; // HF repo // NOLINT
277
- std::string hf_file = ""; // HF file // NOLINT
278
- std::string prompt = ""; // NOLINT
279
- std::string prompt_file = ""; // store the external prompt file name // NOLINT
280
- std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
281
- std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
282
- std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
283
- std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
284
- std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
285
- std::string logits_file = ""; // file for saving *all* logits // NOLINT
286
-
287
- std::vector<std::string> in_files; // all input files
288
- std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
289
- std::vector<llama_model_kv_override> kv_overrides;
290
-
291
- bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
292
- std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
293
-
294
- std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
295
-
296
- int32_t verbosity = 0;
297
- int32_t control_vector_layer_start = -1; // layer range for control vector
298
- int32_t control_vector_layer_end = -1; // layer range for control vector
299
-
300
- int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
301
- int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
302
- // (which is more convenient to use for plotting)
303
- //
304
- bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
305
- size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
306
-
307
- bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
308
- size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
309
-
310
- bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
311
- size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
312
-
313
- bool kl_divergence = false; // compute KL divergence
314
-
315
- bool usage = false; // print usage
316
- bool completion = false; // print source-able completion script
317
- bool use_color = false; // use color to distinguish generations and inputs
318
- bool special = false; // enable special token output
319
- bool interactive = false; // interactive mode
320
- bool interactive_first = false; // wait for user input immediately
321
- bool prompt_cache_all = false; // save user input and generations to prompt cache
322
- bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
323
-
324
- bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
325
- bool multiline_input = false; // reverse the usage of `\`
326
- bool simple_io = false; // improves compatibility with subprocesses and limited consoles
327
- bool cont_batching = true; // insert new sequences for decoding on-the-fly
328
- bool flash_attn = false; // flash attention
329
- bool no_perf = false; // disable performance metrics
330
- bool ctx_shift = true; // context shift on inifinite text generation
331
-
332
- bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
333
- bool logits_all = false; // return logits for all tokens in the batch
334
- bool use_mmap = true; // use mmap for faster loads
335
- bool use_mlock = false; // use mlock to keep model in memory
336
- bool verbose_prompt = false; // print prompt tokens before generation
337
- bool display_prompt = true; // print prompt before generation
338
- bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
339
- bool no_kv_offload = false; // disable KV offloading
340
- bool warmup = true; // warmup run
341
- bool check_tensors = false; // validate tensor data
342
-
343
- lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
344
- lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
345
-
346
- common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
347
-
348
- // multimodal models (see examples/llava)
349
- std::string mmproj = ""; // path to multimodal projector // NOLINT
350
- std::vector<std::string> image; // path to image file(s)
351
-
352
- // embedding
353
- bool embedding = false; // get only sentence embedding
354
- int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
355
- std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
356
- std::string embd_sep = "\n"; // separator of embeddings
357
- bool reranking = false; // enable reranking support on server
358
-
359
- // server params
360
- int32_t port = 8080; // server listens on this network port
361
- int32_t timeout_read = 600; // http read timeout in seconds
362
- int32_t timeout_write = timeout_read; // http write timeout in seconds
363
- int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
364
- int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
365
-
366
- std::string hostname = "127.0.0.1";
367
- std::string public_path = ""; // NOLINT
368
- std::string chat_template = ""; // NOLINT
369
- bool use_jinja = false; // NOLINT
370
- bool enable_chat_template = true;
371
- common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
372
-
373
- std::vector<std::string> api_keys;
374
-
375
- std::string ssl_file_key = ""; // NOLINT
376
- std::string ssl_file_cert = ""; // NOLINT
377
-
378
- // "advanced" endpoints are disabled by default for better security
379
- bool webui = true;
380
- bool endpoint_slots = false;
381
- bool endpoint_props = false; // only control POST requests, not GET
382
- bool endpoint_metrics = false;
383
-
384
- bool log_json = false;
385
-
386
- std::string slot_save_path;
387
-
388
- float slot_prompt_similarity = 0.5f;
389
-
390
- // batched-bench params
391
- bool is_pp_shared = false;
392
-
393
- std::vector<int32_t> n_pp;
394
- std::vector<int32_t> n_tg;
395
- std::vector<int32_t> n_pl;
396
-
397
- // retrieval params
398
- std::vector<std::string> context_files; // context files to embed
399
-
400
- int32_t chunk_size = 64; // chunk size for context embedding
401
-
402
- std::string chunk_separator = "\n"; // chunk separator for context embedding
403
-
404
- // passkey params
405
- int32_t n_junk = 250; // number of times to repeat the junk text
406
- int32_t i_pos = -1; // position of the passkey in the junk text
407
-
408
- // imatrix params
409
- std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
410
-
411
- int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
412
- int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
413
- int32_t i_chunk = 0; // start processing from this chunk
414
-
415
- bool process_output = false; // collect data for the output tensor
416
- bool compute_ppl = true; // whether to compute perplexity
417
-
418
- // cvector-generator params
419
- int n_pca_batch = 100;
420
- int n_pca_iterations = 1000;
421
- dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
422
- std::string cvector_outfile = "control_vector.gguf";
423
- std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
424
- std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
425
-
426
- bool spm_infill = false; // suffix/prefix/middle pattern for infill
427
-
428
- std::string lora_outfile = "ggml-lora-merged-f16.gguf";
429
-
430
- // batched-bench params
431
- bool batched_bench_output_jsonl = false;
432
- };
433
-
434
- // call once at the start of a program if it uses libcommon
435
- // initializes the logging system and prints info about the build
436
- void common_init();
437
-
438
- std::string common_params_get_system_info(const common_params & params);
439
-
440
- bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
441
- bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
442
- void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
443
- bool set_process_priority(enum lm_ggml_sched_priority prio);
444
-
445
- //
446
- // String utils
447
- //
448
-
449
- #ifdef __GNUC__
450
- # if defined(__MINGW32__) && !defined(__clang__)
451
- # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
452
- # else
453
- # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
454
- # endif
455
- #else
456
- # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
457
- #endif
458
-
459
- LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
460
- std::string string_format(const char * fmt, ...);
461
-
462
- std::string string_strip(const std::string & str);
463
- std::string string_get_sortable_timestamp();
464
-
465
- std::string string_join(const std::vector<std::string> & values, const std::string & separator);
466
- std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
467
- std::string string_repeat(const std::string & str, size_t n);
468
-
469
- void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
470
-
471
- template<class T>
472
- static std::vector<T> string_split(const std::string & str, char delim) {
473
- static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
474
- std::vector<T> values;
475
- std::istringstream str_stream(str);
476
- std::string token;
477
- while (std::getline(str_stream, token, delim)) {
478
- T value;
479
- std::istringstream token_stream(token);
480
- token_stream >> value;
481
- values.push_back(value);
482
- }
483
- return values;
484
- }
485
-
486
- template<>
487
- std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
488
- {
489
- std::vector<std::string> parts;
490
- size_t begin_pos = 0;
491
- size_t separator_pos = input.find(separator);
492
- while (separator_pos != std::string::npos) {
493
- std::string part = input.substr(begin_pos, separator_pos - begin_pos);
494
- parts.emplace_back(part);
495
- begin_pos = separator_pos + 1;
496
- separator_pos = input.find(separator, begin_pos);
497
- }
498
- parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
499
- return parts;
500
- }
501
-
502
- static bool string_starts_with(const std::string & str,
503
- const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
504
- return str.rfind(prefix, 0) == 0;
505
- }
506
-
507
- static bool string_ends_with(const std::string & str,
508
- const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
509
- return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
510
- }
511
-
512
- bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
513
- void string_process_escapes(std::string & input);
514
-
515
- std::string string_from(bool value);
516
- std::string string_from(const std::vector<int> & values);
517
- std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
518
- std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
519
-
520
- //
521
- // Filesystem utils
522
- //
523
-
524
- bool fs_validate_filename(const std::string & filename);
525
- bool fs_create_directory_with_parents(const std::string & path);
526
-
527
- std::string fs_get_cache_directory();
528
- std::string fs_get_cache_file(const std::string & filename);
529
-
530
- //
531
- // Model utils
532
- //
533
-
534
- // note: defines object's lifetime
535
- struct common_init_result {
536
- llama_model_ptr model;
537
- llama_context_ptr context;
538
-
539
- std::vector<llama_adapter_lora_ptr> lora;
540
- };
541
-
542
- struct common_init_result common_init_from_params(common_params & params);
543
-
544
- struct llama_model_params common_model_params_to_llama ( common_params & params);
545
- struct llama_context_params common_context_params_to_llama(const common_params & params);
546
- struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
547
-
548
- struct llama_model * common_load_model_from_url(
549
- const std::string & model_url,
550
- const std::string & local_path,
551
- const std::string & hf_token,
552
- const struct llama_model_params & params);
553
-
554
- struct llama_model * common_load_model_from_hf(
555
- const std::string & repo,
556
- const std::string & remote_path,
557
- const std::string & local_path,
558
- const std::string & hf_token,
559
- const struct llama_model_params & params);
560
-
561
- std::pair<std::string, std::string> common_get_hf_file(
562
- const std::string & hf_repo_with_tag,
563
- const std::string & hf_token);
564
-
565
- std::pair<std::string, std::string> common_get_hf_file(
566
- const std::string & hf_repo_with_tag,
567
- const std::string & hf_token);
568
-
569
- // clear LoRA adapters from context, then apply new list of adapters
570
- void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
571
-
572
- //
573
- // Batch utils
574
- //
575
-
576
- void common_batch_clear(struct llama_batch & batch);
577
-
578
- void common_batch_add(
579
- struct llama_batch & batch,
580
- llama_token id,
581
- llama_pos pos,
582
- const std::vector<llama_seq_id> & seq_ids,
583
- bool logits);
584
-
585
- //
586
- // Token utils
587
- //
588
-
589
- // longest common prefix
590
- size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
591
-
592
- // longet common subsequence
593
- size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
594
-
595
- //
596
- // Vocab utils
597
- //
598
-
599
- // tokenizes a string into a vector of tokens
600
- // should work similar to Python's `tokenizer.encode`
601
- std::vector<llama_token> common_tokenize(
602
- const struct llama_context * ctx,
603
- const std::string & text,
604
- bool add_special,
605
- bool parse_special = false);
606
-
607
- std::vector<llama_token> common_tokenize(
608
- const struct llama_vocab * vocab,
609
- const std::string & text,
610
- bool add_special,
611
- bool parse_special = false);
612
-
613
- // tokenizes a token into a piece, optionally renders special/control tokens
614
- // should work similar to Python's `tokenizer.id_to_piece`
615
- std::string common_token_to_piece(
616
- const struct llama_context * ctx,
617
- llama_token token,
618
- bool special = true);
619
-
620
- std::string common_token_to_piece(
621
- const struct llama_vocab * vocab,
622
- llama_token token,
623
- bool special = true);
624
-
625
- // detokenizes a vector of tokens into a string
626
- // should work similar to Python's `tokenizer.decode`
627
- // optionally renders special/control tokens
628
- std::string common_detokenize(
629
- const struct llama_context * ctx,
630
- const std::vector<llama_token> & tokens,
631
- bool special = true);
632
-
633
- std::string common_detokenize(
634
- const struct llama_vocab * vocab,
635
- const std::vector<llama_token> & tokens,
636
- bool special = true);
637
-
638
- //
639
- // Chat template utils
640
- //
641
-
642
- struct common_tool_call {
643
- std::string name;
644
- std::string arguments;
645
- std::string id;
646
- };
647
-
648
- // same with llama_chat_message, but uses std::string
649
- struct common_chat_msg {
650
- std::string role;
651
- std::string content;
652
- std::vector<common_tool_call> tool_calls;
653
- std::string reasoning_content = "";
654
- };
655
-
656
- // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
657
- bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
658
-
659
- namespace minja {
660
- class chat_template;
661
- }
662
-
663
- typedef minja::chat_template common_chat_template;
664
-
665
- struct common_chat_templates {
666
- bool has_explicit_template; // Model had builtin template or template overridde was specified.
667
- std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
668
- std::unique_ptr<common_chat_template> template_tool_use;
669
- };
670
-
671
- // CPP wrapper for llama_chat_apply_template
672
- // If the built-in template is not supported, we default to chatml
673
- // If the custom "tmpl" is not supported, we throw an error
674
- std::string common_chat_apply_template(
675
- const common_chat_template & tmpl,
676
- const std::vector<common_chat_msg> & chat,
677
- bool add_ass,
678
- bool use_jinja);
679
-
680
- // Format single message, while taking into account the position of that message in chat history
681
- std::string common_chat_format_single(
682
- const common_chat_template & tmpl,
683
- const std::vector<common_chat_msg> & past_msg,
684
- const common_chat_msg & new_msg,
685
- bool add_ass,
686
- bool use_jinja);
687
-
688
- // Returns an example of formatted chat
689
- std::string common_chat_format_example(
690
- const common_chat_template & tmpl, bool use_jinja);
691
-
692
- common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);
693
-
694
- //
695
- // KV cache utils
696
- //
697
-
698
- // Dump the KV cache view with the number of sequences per cell.
699
- void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
700
-
701
- // Dump the KV cache view showing individual sequences in each cell (long output).
702
- void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
703
-
704
- //
705
- // Embedding utils
706
- //
707
-
708
- // TODO: repace embd_norm with an enum
709
- void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
710
-
711
- float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
712
-
713
- //
714
- // Control vector utils
715
- //
716
-
717
- struct common_control_vector_data {
718
- int n_embd;
719
-
720
- // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
721
- std::vector<float> data;
722
- };
723
-
724
- struct common_control_vector_load_info {
725
- float strength;
726
-
727
- std::string fname;
728
- };
729
-
730
- // Load control vectors, scale each by strength, and add them together.
731
- // On error, returns {-1, empty}
732
- common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
733
-
734
- //
735
- // Split utils
736
- //
737
-
738
- namespace {
739
-
740
- const char * const LLM_KV_SPLIT_NO = "split.no";
741
- const char * const LLM_KV_SPLIT_COUNT = "split.count";
742
- const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
743
-
744
- }
1
+ // Various helper functions and utilities
2
+
3
+ #pragma once
4
+
5
+ #include "llama-cpp.h"
6
+
7
+ #include <set>
8
+ #include <string>
9
+ #include <vector>
10
+ #include <sstream>
11
+
12
+ #ifdef _WIN32
13
+ #define DIRECTORY_SEPARATOR '\\'
14
+ #else
15
+ #define DIRECTORY_SEPARATOR '/'
16
+ #endif // _WIN32
17
+
18
+ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
19
+ #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
20
+
21
+ #define print_build_info() do { \
22
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
23
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
24
+ } while(0)
25
+
26
+ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
27
+
28
+ struct common_adapter_lora_info {
29
+ std::string path;
30
+ float scale;
31
+
32
+ struct llama_adapter_lora * ptr;
33
+ };
34
+
35
+ using llama_tokens = std::vector<llama_token>;
36
+
37
+ // build info
38
+ extern int LLAMA_BUILD_NUMBER;
39
+ extern const char * LLAMA_COMMIT;
40
+ extern const char * LLAMA_COMPILER;
41
+ extern const char * LLAMA_BUILD_TARGET;
42
+
43
+ struct common_control_vector_load_info;
44
+
45
+ #define print_build_info() do { \
46
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
47
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
48
+ } while(0)
49
+
50
+ // build info
51
+ extern int LLAMA_BUILD_NUMBER;
52
+ extern char const *LLAMA_COMMIT;
53
+ extern char const *LLAMA_COMPILER;
54
+ extern char const *LLAMA_BUILD_TARGET;
55
+
56
+ //
57
+ // CPU utils
58
+ //
59
+
60
+ struct cpu_params {
61
+ int n_threads = -1;
62
+ bool cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
63
+ bool mask_valid = false; // Default: any CPU
64
+ enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
65
+ bool strict_cpu = false; // Use strict CPU placement
66
+ uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
67
+ };
68
+
69
+ int32_t cpu_get_num_physical_cores();
70
+ int32_t cpu_get_num_math();
71
+
72
+ //
73
+ // Common params
74
+ //
75
+
76
+ enum llama_example {
77
+ LLAMA_EXAMPLE_COMMON,
78
+ LLAMA_EXAMPLE_SPECULATIVE,
79
+ LLAMA_EXAMPLE_MAIN,
80
+ LLAMA_EXAMPLE_INFILL,
81
+ LLAMA_EXAMPLE_EMBEDDING,
82
+ LLAMA_EXAMPLE_PERPLEXITY,
83
+ LLAMA_EXAMPLE_RETRIEVAL,
84
+ LLAMA_EXAMPLE_PASSKEY,
85
+ LLAMA_EXAMPLE_IMATRIX,
86
+ LLAMA_EXAMPLE_BENCH,
87
+ LLAMA_EXAMPLE_SERVER,
88
+ LLAMA_EXAMPLE_CVECTOR_GENERATOR,
89
+ LLAMA_EXAMPLE_EXPORT_LORA,
90
+ LLAMA_EXAMPLE_LLAVA,
91
+ LLAMA_EXAMPLE_LOOKUP,
92
+ LLAMA_EXAMPLE_PARALLEL,
93
+ LLAMA_EXAMPLE_TTS,
94
+
95
+ LLAMA_EXAMPLE_COUNT,
96
+ };
97
+
98
+ enum common_sampler_type {
99
+ COMMON_SAMPLER_TYPE_NONE = 0,
100
+ COMMON_SAMPLER_TYPE_DRY = 1,
101
+ COMMON_SAMPLER_TYPE_TOP_K = 2,
102
+ COMMON_SAMPLER_TYPE_TOP_P = 3,
103
+ COMMON_SAMPLER_TYPE_MIN_P = 4,
104
+ //COMMON_SAMPLER_TYPE_TFS_Z = 5,
105
+ COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
106
+ COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
107
+ COMMON_SAMPLER_TYPE_XTC = 8,
108
+ COMMON_SAMPLER_TYPE_INFILL = 9,
109
+ COMMON_SAMPLER_TYPE_PENALTIES = 10,
110
+ };
111
+
112
+ // dimensionality reduction methods, used by cvector-generator
113
+ enum dimre_method {
114
+ DIMRE_METHOD_PCA,
115
+ DIMRE_METHOD_MEAN,
116
+ };
117
+
118
+ enum common_conversation_mode {
119
+ COMMON_CONVERSATION_MODE_DISABLED = 0,
120
+ COMMON_CONVERSATION_MODE_ENABLED = 1,
121
+ COMMON_CONVERSATION_MODE_AUTO = 2,
122
+ };
123
+
124
+ enum common_grammar_trigger_type {
125
+ COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
126
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
127
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
128
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
129
+ };
130
+
131
+ struct common_grammar_trigger {
132
+ common_grammar_trigger_type type;
133
+ std::string value;
134
+ llama_token token = LLAMA_TOKEN_NULL;
135
+ };
136
+
137
+ // sampling parameters
138
+ struct common_params_sampling {
139
+ uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
140
+
141
+ int32_t n_prev = 64; // number of previous tokens to remember
142
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
143
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
144
+ int32_t top_k = 40; // <= 0 to use vocab size
145
+ float top_p = 0.95f; // 1.0 = disabled
146
+ float min_p = 0.05f; // 0.0 = disabled
147
+ float xtc_probability = 0.00f; // 0.0 = disabled
148
+ float xtc_threshold = 0.10f; // > 0.5 disables XTC
149
+ float typ_p = 1.00f; // typical_p, 1.0 = disabled
150
+ float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
151
+ float dynatemp_range = 0.00f; // 0.0 = disabled
152
+ float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
153
+ int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
154
+ float penalty_repeat = 1.00f; // 1.0 = disabled
155
+ float penalty_freq = 0.00f; // 0.0 = disabled
156
+ float penalty_present = 0.00f; // 0.0 = disabled
157
+ float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
158
+ float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
159
+ int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
160
+ int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
161
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
162
+ float top_n_sigma = -1.00f;// -1.0 = disabled
163
+ float mirostat_tau = 5.00f; // target entropy
164
+ float mirostat_eta = 0.10f; // learning rate
165
+ bool ignore_eos = false;
166
+ bool no_perf = false; // disable performance metrics
167
+ bool timing_per_token = false;
168
+
169
+ std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
170
+
171
+
172
+ std::vector<enum common_sampler_type> samplers = {
173
+ COMMON_SAMPLER_TYPE_PENALTIES,
174
+ COMMON_SAMPLER_TYPE_DRY,
175
+ COMMON_SAMPLER_TYPE_TOP_K,
176
+ COMMON_SAMPLER_TYPE_TYPICAL_P,
177
+ COMMON_SAMPLER_TYPE_TOP_P,
178
+ COMMON_SAMPLER_TYPE_MIN_P,
179
+ COMMON_SAMPLER_TYPE_XTC,
180
+ COMMON_SAMPLER_TYPE_TEMPERATURE,
181
+ };
182
+
183
+ std::string grammar; // optional BNF-like grammar to constrain sampling
184
+ bool grammar_lazy = false;
185
+ std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
186
+ std::set<llama_token> preserved_tokens;
187
+
188
+ std::vector<llama_logit_bias> logit_bias; // logit biases to apply
189
+
190
+ // print the parameters into a string
191
+ std::string print() const;
192
+ };
193
+
194
+ struct common_params_model {
195
+ std::string path = ""; // model local path // NOLINT
196
+ std::string url = ""; // model url to download // NOLINT
197
+ std::string hf_repo = ""; // HF repo // NOLINT
198
+ std::string hf_file = ""; // HF file // NOLINT
199
+ };
200
+
201
+ struct common_params_speculative {
202
+ std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
203
+
204
+ int32_t n_ctx = 0; // draft context size
205
+ int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
206
+ int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
207
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
208
+ float p_split = 0.1f; // speculative decoding split probability
209
+ float p_min = 0.75f; // minimum speculative decoding probability (greedy)
210
+
211
+ struct cpu_params cpuparams;
212
+ struct cpu_params cpuparams_batch;
213
+
214
+ struct common_params_model model;
215
+ };
216
+
217
+ struct common_params_vocoder {
218
+ struct common_params_model model;
219
+
220
+ std::string speaker_file = ""; // speaker file path // NOLINT
221
+
222
+ bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
223
+ };
224
+
225
+ enum common_reasoning_format {
226
+ COMMON_REASONING_FORMAT_NONE,
227
+ COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
228
+ };
229
+
+ struct common_params {
+
+     void * progress_callback_user_data = nullptr;
+     llama_progress_callback progress_callback = nullptr;
+     bool vocab_only = false;
+     int32_t n_predict = -1; // new tokens to predict
+     int32_t n_ctx = 4096; // context size
+     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
+     int32_t n_keep = 0; // number of tokens to keep from initial prompt
+     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+     int32_t n_parallel = 1; // number of parallel sequences to decode
+     int32_t n_sequences = 1; // number of sequences to decode
+     int32_t grp_attn_n = 1; // group-attention factor
+     int32_t grp_attn_w = 512; // group-attention width
+     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
+     float rope_freq_base = 0.0f; // RoPE base frequency
+     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+     float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
+     float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+     float yarn_beta_fast = 32.0f; // YaRN low correction dim
+     float yarn_beta_slow = 1.0f; // YaRN high correction dim
+     int32_t yarn_orig_ctx = 0; // YaRN original context length
+     float defrag_thold = 0.1f; // KV cache defragmentation threshold
+
+     // offload params
+     std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
+
+     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
+     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
+     struct cpu_params cpuparams;
+     struct cpu_params cpuparams_batch;
+
+     lm_ggml_backend_sched_eval_callback cb_eval = nullptr;
+     void * cb_eval_user_data = nullptr;
+
+     lm_ggml_numa_strategy numa = LM_GGML_NUMA_STRATEGY_DISABLED;
+
+     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
+
+     struct common_params_sampling sampling;
+     struct common_params_speculative speculative;
+     struct common_params_vocoder vocoder;
+
+     struct common_params_model model;
+
+     std::string model_alias = ""; // model alias // NOLINT
+     std::string hf_token = ""; // HF token // NOLINT
+     std::string prompt = ""; // NOLINT
+     std::string system_prompt = ""; // NOLINT
+     std::string prompt_file = ""; // store the external prompt file name // NOLINT
+     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
+     std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
+     std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
+     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
+     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
+     std::string logits_file = ""; // file for saving *all* logits // NOLINT
+
+     std::vector<std::string> in_files; // all input files
+     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
+     std::vector<llama_model_kv_override> kv_overrides;
+     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
+
+     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
+
+     std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
+
+     int32_t verbosity = 0;
+     int32_t control_vector_layer_start = -1; // layer range for control vector
+     int32_t control_vector_layer_end = -1; // layer range for control vector
+
+     int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+                                  // (which is more convenient to use for plotting)
+                                  //
+     bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
+
+     bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+     size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+
+     bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+     size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+
+     bool kl_divergence = false; // compute KL divergence
+
+     bool usage = false; // print usage
+     bool completion = false; // print source-able completion script
+     bool use_color = false; // use color to distinguish generations and inputs
+     bool special = false; // enable special token output
+     bool interactive = false; // interactive mode
+     bool interactive_first = false; // wait for user input immediately
+     bool prompt_cache_all = false; // save user input and generations to prompt cache
+     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
+
+     bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
+     bool multiline_input = false; // reverse the usage of `\`
+     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
+     bool cont_batching = true; // insert new sequences for decoding on-the-fly
+     bool flash_attn = false; // flash attention
+     bool no_perf = false; // disable performance metrics
+     bool ctx_shift = true; // context shift on infinite text generation
+
+     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
+     bool logits_all = false; // return logits for all tokens in the batch
+     bool use_mmap = true; // use mmap for faster loads
+     bool use_mlock = false; // use mlock to keep model in memory
+     bool verbose_prompt = false; // print prompt tokens before generation
+     bool display_prompt = true; // print prompt before generation
+     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
+     bool no_kv_offload = false; // disable KV offloading
+     bool warmup = true; // warmup run
+     bool check_tensors = false; // validate tensor data
+
+     bool single_turn = false; // single turn chat conversation
+
+     lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
+     lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
+
+     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+
+     // multimodal models (see examples/llava)
+     struct common_params_model mmproj;
+     std::vector<std::string> image; // path to image file(s)
+
+     // embedding
+     bool embedding = false; // get only sentence embedding
+     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+     std::string embd_sep = "\n"; // separator of embeddings
+     bool reranking = false; // enable reranking support on server
+
+     // server params
+     int32_t port = 8080; // server listens on this network port
+     int32_t timeout_read = 600; // http read timeout in seconds
+     int32_t timeout_write = timeout_read; // http write timeout in seconds
+     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
+
+     std::string hostname = "127.0.0.1";
+     std::string public_path = ""; // NOLINT
+     std::string chat_template = ""; // NOLINT
+     bool use_jinja = false; // NOLINT
+     bool enable_chat_template = true;
+     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+
+     std::vector<std::string> api_keys;
+
+     std::string ssl_file_key = ""; // NOLINT
+     std::string ssl_file_cert = ""; // NOLINT
+
+     // "advanced" endpoints are disabled by default for better security
+     bool webui = true;
+     bool endpoint_slots = false;
+     bool endpoint_props = false; // only control POST requests, not GET
+     bool endpoint_metrics = false;
+
+     bool log_json = false;
+
+     std::string slot_save_path;
+
+     float slot_prompt_similarity = 0.5f;
+
+     // batched-bench params
+     bool is_pp_shared = false;
+
+     std::vector<int32_t> n_pp;
+     std::vector<int32_t> n_tg;
+     std::vector<int32_t> n_pl;
+
+     // retrieval params
+     std::vector<std::string> context_files; // context files to embed
+
+     int32_t chunk_size = 64; // chunk size for context embedding
+
+     std::string chunk_separator = "\n"; // chunk separator for context embedding
+
+     // passkey params
+     int32_t n_junk = 250; // number of times to repeat the junk text
+     int32_t i_pos = -1; // position of the passkey in the junk text
+
+     // imatrix params
+     int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
+     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
+     int32_t i_chunk = 0; // start processing from this chunk
+
+     bool process_output = false; // collect data for the output tensor
+     bool compute_ppl = true; // whether to compute perplexity
+
+     // cvector-generator params
+     int n_pca_batch = 100;
+     int n_pca_iterations = 1000;
+     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
+     std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+     bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+     // batched-bench params
+     bool batched_bench_output_jsonl = false;
+
+     // common params
+     std::string out_file; // output filename for all example programs
+ };
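`common_params` aggregates the nested parameter structs (`sampling`, `speculative`, `vocoder`, `model`), so a typical caller only touches a handful of fields. A minimal sketch under the assumption of a local GGUF file (path and numbers are illustrative):

    common_params params;
    params.model.path   = "/models/example-7b-q4_k_m.gguf"; // placeholder path
    params.n_ctx        = 8192;  // context size
    params.n_gpu_layers = 99;    // offload as many layers as possible
    params.sampling.samplers = {
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };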
+
+ // call once at the start of a program if it uses libcommon
+ // initializes the logging system and prints info about the build
+ void common_init();
+
+ std::string common_params_get_system_info(const common_params & params);
+
+ bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+ bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+ void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
+ bool set_process_priority(enum lm_ggml_sched_priority prio);
+
+ //
+ // String utils
+ //
+
+ #ifdef __GNUC__
+ #    if defined(__MINGW32__) && !defined(__clang__)
+ #        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ #    else
+ #        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+ #    endif
+ #else
+ #    define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+ #endif
+
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+ std::string string_format(const char * fmt, ...);
+
+ std::string string_strip(const std::string & str);
+ std::string string_get_sortable_timestamp();
+
+ std::string string_join(const std::vector<std::string> & values, const std::string & separator);
+ std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
+ std::string string_repeat(const std::string & str, size_t n);
+
+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
+ std::string regex_escape(const std::string & s);
+
+ template<class T>
+ static std::vector<T> string_split(const std::string & str, char delim) {
+     static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
+     std::vector<T> values;
+     std::istringstream str_stream(str);
+     std::string token;
+     while (std::getline(str_stream, token, delim)) {
+         T value;
+         std::istringstream token_stream(token);
+         token_stream >> value;
+         values.push_back(value);
+     }
+     return values;
+ }
+
+ template<>
+ std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+ {
+     std::vector<std::string> parts;
+     size_t begin_pos = 0;
+     size_t separator_pos = input.find(separator);
+     while (separator_pos != std::string::npos) {
+         std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+         parts.emplace_back(part);
+         begin_pos = separator_pos + 1;
+         separator_pos = input.find(separator, begin_pos);
+     }
+     parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+     return parts;
+ }
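For reference, the two overloads behave slightly differently: the template parses each field through `operator>>`, while the `std::string` specialization keeps empty fields. A small usage sketch (inputs are illustrative):

    std::vector<int>         ids   = string_split<int>("1,2,3", ',');        // {1, 2, 3}
    std::vector<std::string> parts = string_split<std::string>("a::b", ':'); // {"a", "", "b"}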
+
+ static bool string_starts_with(const std::string & str,
+                                const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
+     return str.rfind(prefix, 0) == 0;
+ }
+
+ static bool string_ends_with(const std::string & str,
+                              const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
+     return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+ }
+
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+ void string_process_escapes(std::string & input);
+
+ std::string string_from(bool value);
+ std::string string_from(const std::vector<int> & values);
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
+ //
+ // Filesystem utils
+ //
+
+ bool fs_validate_filename(const std::string & filename);
+ bool fs_create_directory_with_parents(const std::string & path);
+
+ std::string fs_get_cache_directory();
+ std::string fs_get_cache_file(const std::string & filename);
+
+ //
+ // Model utils
+ //
+
+ // note: defines object's lifetime
+ struct common_init_result {
+     llama_model_ptr model;
+     llama_context_ptr context;
+
+     std::vector<llama_adapter_lora_ptr> lora;
+ };
+
+ struct common_init_result common_init_from_params(common_params & params);
+
+ struct llama_model_params common_model_params_to_llama(common_params & params);
+ struct llama_context_params common_context_params_to_llama(const common_params & params);
+ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
+
+ // clear LoRA adapters from context, then apply new list of adapters
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
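`common_init_result` holds the model, context, and LoRA adapters behind smart pointers, so their lifetime ends with the result object. A hedged sketch of the usual initialization sequence (the path is a placeholder and error handling is abbreviated):

    common_params params;
    params.model.path = "/models/example-7b-q4_k_m.gguf"; // placeholder path

    common_init();                                              // logging + build info
    common_init_result init = common_init_from_params(params);  // loads model and creates context
    if (!init.model || !init.context) {
        // handle load failure here
    }
    llama_context * ctx = init.context.get(); // raw handle for subsequent llama_* calls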
+
+ //
+ // Batch utils
+ //
+
+ void common_batch_clear(struct llama_batch & batch);
+
+ void common_batch_add(
+     struct llama_batch & batch,
+     llama_token id,
+     llama_pos pos,
+     const std::vector<llama_seq_id> & seq_ids,
+     bool logits);
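A common pattern is to clear the batch and re-fill it token by token, requesting logits only where they are needed. A sketch assuming `batch` and `prompt_tokens` already exist (both are placeholders here):

    common_batch_clear(batch);
    for (size_t i = 0; i < prompt_tokens.size(); ++i) {
        const bool want_logits = (i == prompt_tokens.size() - 1); // logits only for the last token
        common_batch_add(batch, prompt_tokens[i], (llama_pos) i, { 0 }, want_logits);
    }
    // the filled batch would then be passed to llama_decode(ctx, batch)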
+
+ //
+ // Token utils
+ //
+
+ // longest common prefix
+ size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+ // longest common subsequence
+ size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
+ //
+ // Vocab utils
+ //
+
+ // tokenizes a string into a vector of tokens
+ // should work similar to Python's `tokenizer.encode`
+ std::vector<llama_token> common_tokenize(
+     const struct llama_context * ctx,
+     const std::string & text,
+     bool add_special,
+     bool parse_special = false);
+
+ std::vector<llama_token> common_tokenize(
+     const struct llama_vocab * vocab,
+     const std::string & text,
+     bool add_special,
+     bool parse_special = false);
+
+ // converts a token into a piece, optionally renders special/control tokens
+ // should work similar to Python's `tokenizer.id_to_piece`
+ std::string common_token_to_piece(
+     const struct llama_context * ctx,
+     llama_token token,
+     bool special = true);
+
+ std::string common_token_to_piece(
+     const struct llama_vocab * vocab,
+     llama_token token,
+     bool special = true);
+
+ // detokenizes a vector of tokens into a string
+ // should work similar to Python's `tokenizer.decode`
+ // optionally renders special/control tokens
+ std::string common_detokenize(
+     const struct llama_context * ctx,
+     const std::vector<llama_token> & tokens,
+     bool special = true);
+
+ std::string common_detokenize(
+     const struct llama_vocab * vocab,
+     const std::vector<llama_token> & tokens,
+     bool special = true);
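These helpers mirror a Python tokenizer's encode/decode pair. A round-trip sketch, assuming `ctx` holds a context created from a loaded model:

    std::vector<llama_token> toks = common_tokenize(ctx, "Hello, world!", /*add_special=*/true, /*parse_special=*/true);
    for (llama_token t : toks) {
        printf("%d -> '%s'\n", t, common_token_to_piece(ctx, t).c_str()); // per-token pieces
    }
    std::string text = common_detokenize(ctx, toks, /*special=*/false);   // back to plain text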
+
+ //
+ // KV cache utils
+ //
+
+ // Dump the KV cache view with the number of sequences per cell.
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+
+ // Dump the KV cache view showing individual sequences in each cell (long output).
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
+ //
+ // Embedding utils
+ //
+
+ // TODO: replace embd_norm with an enum
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
+
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
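`common_embd_normalize` uses the same `embd_norm` convention as the `embd_normalize` field above (2 = euclidean), and cosine similarity is usually computed on the normalized vectors. A sketch assuming `a_raw`, `b_raw`, and `n_embd` come from an embedding run:

    std::vector<float> a_norm(n_embd), b_norm(n_embd);
    common_embd_normalize(a_raw, a_norm.data(), n_embd, /*embd_norm=*/2); // euclidean (L2) normalization
    common_embd_normalize(b_raw, b_norm.data(), n_embd, /*embd_norm=*/2);
    float sim = common_embd_similarity_cos(a_norm.data(), b_norm.data(), n_embd);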
+
+ //
+ // Control vector utils
+ //
+
+ struct common_control_vector_data {
+     int n_embd;
+
+     // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
+     std::vector<float> data;
+ };
+
+ struct common_control_vector_load_info {
+     float strength;
+
+     std::string fname;
+ };
+
+ // Load control vectors, scale each by strength, and add them together.
+ // On error, returns {-1, empty}
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
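Each entry pairs a control-vector file with a signed strength, and the loader sums the scaled vectors; `n_embd == -1` signals failure. A sketch with placeholder file names:

    std::vector<common_control_vector_load_info> infos = {
        { /*strength=*/  0.8f, /*fname=*/ "calm.gguf"    }, // placeholder files
        { /*strength=*/ -0.4f, /*fname=*/ "verbose.gguf" },
    };
    common_control_vector_data cvec = common_control_vector_load(infos);
    if (cvec.n_embd == -1) {
        // load failed
    }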
+
+ //
+ // Split utils
+ //
+
+ namespace {
+
+ const char * const LLM_KV_SPLIT_NO            = "split.no";
+ const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+ }