cui-llama.rn 1.4.4 → 1.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. package/android/src/main/CMakeLists.txt +2 -2
  2. package/android/src/main/jni.cpp +12 -10
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/chat-template.hpp +529 -529
  12. package/cpp/chat.cpp +959 -265
  13. package/cpp/chat.h +135 -0
  14. package/cpp/common.cpp +2064 -1996
  15. package/cpp/common.h +700 -744
  16. package/cpp/ggml-alloc.c +1039 -1030
  17. package/cpp/ggml-alloc.h +1 -1
  18. package/cpp/ggml-backend-impl.h +255 -255
  19. package/cpp/ggml-backend-reg.cpp +586 -582
  20. package/cpp/ggml-backend.cpp +2004 -2002
  21. package/cpp/ggml-backend.h +354 -354
  22. package/cpp/ggml-common.h +1851 -1851
  23. package/cpp/ggml-cpp.h +39 -39
  24. package/cpp/ggml-cpu-aarch64.cpp +4248 -4247
  25. package/cpp/ggml-cpu-aarch64.h +8 -8
  26. package/cpp/ggml-cpu-impl.h +531 -380
  27. package/cpp/ggml-cpu-quants.c +12527 -11517
  28. package/cpp/ggml-cpu-traits.cpp +36 -36
  29. package/cpp/ggml-cpu-traits.h +38 -38
  30. package/cpp/ggml-cpu.c +15766 -14485
  31. package/cpp/ggml-cpu.cpp +655 -633
  32. package/cpp/ggml-cpu.h +138 -135
  33. package/cpp/ggml-impl.h +567 -567
  34. package/cpp/ggml-metal-impl.h +235 -0
  35. package/cpp/ggml-metal.h +66 -66
  36. package/cpp/ggml-metal.m +5146 -5002
  37. package/cpp/ggml-opt.cpp +854 -854
  38. package/cpp/ggml-opt.h +216 -216
  39. package/cpp/ggml-quants.c +5238 -5238
  40. package/cpp/ggml-threading.h +14 -14
  41. package/cpp/ggml.c +6529 -6524
  42. package/cpp/ggml.h +2198 -2194
  43. package/cpp/gguf.cpp +1329 -1329
  44. package/cpp/gguf.h +202 -202
  45. package/cpp/json-schema-to-grammar.cpp +1024 -1025
  46. package/cpp/json-schema-to-grammar.h +21 -22
  47. package/cpp/json.hpp +24766 -24766
  48. package/cpp/llama-adapter.cpp +347 -347
  49. package/cpp/llama-adapter.h +74 -74
  50. package/cpp/llama-arch.cpp +1513 -1492
  51. package/cpp/llama-arch.h +403 -402
  52. package/cpp/llama-batch.cpp +368 -368
  53. package/cpp/llama-batch.h +88 -88
  54. package/cpp/llama-chat.cpp +588 -587
  55. package/cpp/llama-chat.h +53 -53
  56. package/cpp/llama-context.cpp +1775 -1775
  57. package/cpp/llama-context.h +128 -128
  58. package/cpp/llama-cparams.cpp +1 -1
  59. package/cpp/llama-cparams.h +37 -37
  60. package/cpp/llama-cpp.h +30 -30
  61. package/cpp/llama-grammar.cpp +1219 -1219
  62. package/cpp/llama-grammar.h +173 -164
  63. package/cpp/llama-hparams.cpp +71 -71
  64. package/cpp/llama-hparams.h +139 -139
  65. package/cpp/llama-impl.cpp +167 -167
  66. package/cpp/llama-impl.h +61 -61
  67. package/cpp/llama-kv-cache.cpp +718 -718
  68. package/cpp/llama-kv-cache.h +219 -218
  69. package/cpp/llama-mmap.cpp +600 -590
  70. package/cpp/llama-mmap.h +68 -68
  71. package/cpp/llama-model-loader.cpp +1124 -1124
  72. package/cpp/llama-model-loader.h +167 -167
  73. package/cpp/llama-model.cpp +4087 -4023
  74. package/cpp/llama-model.h +370 -370
  75. package/cpp/llama-sampling.cpp +2558 -2525
  76. package/cpp/llama-sampling.h +32 -32
  77. package/cpp/llama-vocab.cpp +3264 -3252
  78. package/cpp/llama-vocab.h +125 -125
  79. package/cpp/llama.cpp +10284 -10137
  80. package/cpp/llama.h +1354 -1340
  81. package/cpp/log.cpp +393 -423
  82. package/cpp/log.h +132 -132
  83. package/cpp/minja/chat-template.hpp +529 -0
  84. package/cpp/minja/minja.hpp +2915 -0
  85. package/cpp/minja.hpp +2915 -2883
  86. package/cpp/rn-llama.cpp +20 -37
  87. package/cpp/rn-llama.h +12 -2
  88. package/cpp/sampling.cpp +570 -532
  89. package/cpp/sgemm.cpp +2598 -2598
  90. package/cpp/sgemm.h +14 -14
  91. package/cpp/speculative.cpp +278 -277
  92. package/cpp/speculative.h +28 -28
  93. package/package.json +1 -1
  94. package/android/src/main/build-arm64/CMakeCache.txt +0 -429
  95. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  96. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +0 -101
  97. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  98. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  99. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  100. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  101. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  102. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  103. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  104. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -431
  105. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +0 -16
  106. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +0 -165
  107. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +0 -297
  108. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +0 -1
  109. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +0 -1
  110. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +0 -1
  111. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +0 -1
  112. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +0 -1
  113. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +0 -1
  114. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +0 -1
  115. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +0 -8
  116. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +0 -1
  117. package/android/src/main/build-arm64/CMakeFiles/progress.marks +0 -1
  118. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  119. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +0 -58
  120. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  121. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +0 -756
  122. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  123. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +0 -709
  124. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  125. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +0 -714
  126. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  127. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +0 -62
  128. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  129. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +0 -708
  130. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  131. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +0 -113
  132. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  133. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +0 -713
  134. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  135. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +0 -763
  136. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  137. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +0 -61
  138. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  139. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +0 -707
  140. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  141. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +0 -104
  142. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  143. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +0 -714
  144. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  145. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +0 -723
  146. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +0 -62
  147. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +0 -722
  148. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +0 -89
  149. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +0 -2
  150. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +0 -2
  151. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +0 -2
  152. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +0 -17
  153. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +0 -41
  154. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +0 -62
  155. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +0 -722
  156. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +0 -89
  157. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +0 -2
  158. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +0 -2
  159. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +0 -2
  160. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +0 -17
  161. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +0 -41
  162. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +0 -62
  163. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +0 -722
  164. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +0 -89
  165. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +0 -2
  166. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +0 -2
  167. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +0 -2
  168. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +0 -17
  169. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +0 -41
  170. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +0 -62
  171. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +0 -722
  172. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +0 -89
  173. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +0 -2
  174. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +0 -2
  175. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +0 -2
  176. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +0 -17
  177. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +0 -41
  178. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +0 -62
  179. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +0 -722
  180. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +0 -89
  181. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +0 -2
  182. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +0 -2
  183. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +0 -2
  184. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +0 -17
  185. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +0 -41
  186. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +0 -62
  187. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +0 -722
  188. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +0 -89
  189. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +0 -2
  190. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +0 -2
  191. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +0 -2
  192. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +0 -17
  193. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +0 -41
  194. package/android/src/main/build-arm64/Makefile +0 -1862
  195. package/android/src/main/build-arm64/cmake_install.cmake +0 -66
  196. package/cpp/chat.hpp +0 -55
  197. package/cpp/rn-llama.hpp +0 -913
package/cpp/common.cpp CHANGED
@@ -1,1996 +1,2064 @@
1
- #if defined(_MSC_VER)
2
- #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
3
- #endif
4
-
5
- #include "ggml.h"
6
- #include "gguf.h"
7
-
8
- #include "common.h"
9
- #include "log.h"
10
- // Change JSON_ASSERT from assert() to LM_GGML_ASSERT:
11
- #define JSON_ASSERT LM_GGML_ASSERT
12
- #include "json.hpp"
13
- #include "json-schema-to-grammar.h"
14
- #include "llama.h"
15
- #include "chat.hpp"
16
- #include "chat-template.hpp"
17
-
18
- #include <algorithm>
19
- #include <cinttypes>
20
- #include <climits>
21
- #include <cmath>
22
- #include <codecvt>
23
- #include <cstdarg>
24
- #include <cstring>
25
- #include <ctime>
26
- #include <filesystem>
27
- #include <fstream>
28
- #include <iostream>
29
- #include <iterator>
30
- #include <regex>
31
- #include <sstream>
32
- #include <string>
33
- #include <thread>
34
- #include <unordered_map>
35
- #include <unordered_set>
36
- #include <vector>
37
-
38
- #if defined(__APPLE__) && defined(__MACH__)
39
- #include <sys/types.h>
40
- #include <sys/sysctl.h>
41
- #endif
42
-
43
- #if defined(_WIN32)
44
- #define WIN32_LEAN_AND_MEAN
45
- #ifndef NOMINMAX
46
- # define NOMINMAX
47
- #endif
48
- #include <locale>
49
- #include <windows.h>
50
- #include <fcntl.h>
51
- #include <io.h>
52
- #else
53
- #include <sys/ioctl.h>
54
- #include <sys/stat.h>
55
- #include <unistd.h>
56
- #endif
57
- #if defined(LLAMA_USE_CURL)
58
- #include <curl/curl.h>
59
- #include <curl/easy.h>
60
- #include <future>
61
- #endif
62
-
63
- // build info
64
- int LLAMA_BUILD_NUMBER = 0;
65
- char const *LLAMA_COMMIT = "unknown";
66
- char const *LLAMA_COMPILER = "unknown";
67
- char const *LLAMA_BUILD_TARGET = "unknown";
68
-
69
- #if defined(_MSC_VER)
70
- #pragma warning(disable: 4244 4267) // possible loss of data
71
- #endif
72
-
73
- #if defined(LLAMA_USE_CURL)
74
- #ifdef __linux__
75
- #include <linux/limits.h>
76
- #elif defined(_WIN32)
77
- # if !defined(PATH_MAX)
78
- # define PATH_MAX MAX_PATH
79
- # endif
80
- #else
81
- #include <sys/syslimits.h>
82
- #endif
83
- #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
84
-
85
- //
86
- // CURL utils
87
- //
88
-
89
- using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
90
-
91
- // cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
92
- struct curl_slist_ptr {
93
- struct curl_slist * ptr = nullptr;
94
- ~curl_slist_ptr() {
95
- if (ptr) {
96
- curl_slist_free_all(ptr);
97
- }
98
- }
99
- };
100
- #endif // LLAMA_USE_CURL
101
-
102
- using json = nlohmann::ordered_json;
103
-
104
- //
105
- // CPU utils
106
- //
107
-
108
- int32_t cpu_get_num_physical_cores() {
109
- #ifdef __linux__
110
- // enumerate the set of thread siblings, num entries is num cores
111
- std::unordered_set<std::string> siblings;
112
- for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
113
- std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
114
- + std::to_string(cpu) + "/topology/thread_siblings");
115
- if (!thread_siblings.is_open()) {
116
- break; // no more cpus
117
- }
118
- std::string line;
119
- if (std::getline(thread_siblings, line)) {
120
- siblings.insert(line);
121
- }
122
- }
123
- if (!siblings.empty()) {
124
- return static_cast<int32_t>(siblings.size());
125
- }
126
- #elif defined(__APPLE__) && defined(__MACH__)
127
- int32_t num_physical_cores;
128
- size_t len = sizeof(num_physical_cores);
129
- int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
130
- if (result == 0) {
131
- return num_physical_cores;
132
- }
133
- result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
134
- if (result == 0) {
135
- return num_physical_cores;
136
- }
137
- #elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
138
- // TODO: windows + arm64 + mingw64
139
- unsigned int n_threads_win = std::thread::hardware_concurrency();
140
- unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
141
-
142
- DWORD buffer_size = 0;
143
- if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
144
- if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
145
- return default_threads;
146
- }
147
- }
148
-
149
- std::vector<char> buffer(buffer_size);
150
- if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
151
- return default_threads;
152
- }
153
-
154
- int32_t num_physical_cores = 0;
155
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
156
- while (buffer_size > 0) {
157
- if (info->Relationship == RelationProcessorCore) {
158
- num_physical_cores += info->Processor.GroupCount;
159
- }
160
- buffer_size -= info->Size;
161
- info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
162
- }
163
-
164
- return num_physical_cores > 0 ? num_physical_cores : default_threads;
165
- #endif
166
- unsigned int n_threads = std::thread::hardware_concurrency();
167
- return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
168
- }
169
-
170
- #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
171
- #include <pthread.h>
172
-
173
- static void cpuid(unsigned leaf, unsigned subleaf,
174
- unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
175
- __asm__("movq\t%%rbx,%%rsi\n\t"
176
- "cpuid\n\t"
177
- "xchgq\t%%rbx,%%rsi"
178
- : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
179
- : "0"(leaf), "2"(subleaf));
180
- }
181
-
182
- static int pin_cpu(int cpu) {
183
- cpu_set_t mask;
184
- CPU_ZERO(&mask);
185
- CPU_SET(cpu, &mask);
186
- return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
187
- }
188
-
189
- static bool is_hybrid_cpu(void) {
190
- unsigned eax, ebx, ecx, edx;
191
- cpuid(7, 0, &eax, &ebx, &ecx, &edx);
192
- return !!(edx & (1u << 15));
193
- }
194
-
195
- static bool is_running_on_efficiency_core(void) {
196
- unsigned eax, ebx, ecx, edx;
197
- cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
198
- int intel_atom = 0x20;
199
- int core_type = (eax & 0xff000000u) >> 24;
200
- return core_type == intel_atom;
201
- }
202
-
203
- static int cpu_count_math_cpus(int n_cpu) {
204
- int result = 0;
205
- for (int cpu = 0; cpu < n_cpu; ++cpu) {
206
- if (pin_cpu(cpu)) {
207
- return -1;
208
- }
209
- if (is_running_on_efficiency_core()) {
210
- continue; // efficiency cores harm lockstep threading
211
- }
212
- ++cpu; // hyperthreading isn't useful for linear algebra
213
- ++result;
214
- }
215
- return result;
216
- }
217
-
218
- #endif // __x86_64__ && __linux__
219
-
220
- /**
221
- * Returns number of CPUs on system that are useful for math.
222
- */
223
- int32_t cpu_get_num_math() {
224
- #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
225
- int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
226
- if (n_cpu < 1) {
227
- return cpu_get_num_physical_cores();
228
- }
229
- if (is_hybrid_cpu()) {
230
- cpu_set_t affinity;
231
- if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
232
- int result = cpu_count_math_cpus(n_cpu);
233
- pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
234
- if (result > 0) {
235
- return result;
236
- }
237
- }
238
- }
239
- #endif
240
- return cpu_get_num_physical_cores();
241
- }
242
-
243
- // Helper for setting process priority
244
-
245
- #if defined(_WIN32)
246
-
247
- bool set_process_priority(enum lm_ggml_sched_priority prio) {
248
- if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
249
- return true;
250
- }
251
-
252
- DWORD p = NORMAL_PRIORITY_CLASS;
253
- switch (prio) {
254
- case LM_GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
255
- case LM_GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
256
- case LM_GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
257
- case LM_GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
258
- }
259
-
260
- if (!SetPriorityClass(GetCurrentProcess(), p)) {
261
- LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
262
- return false;
263
- }
264
-
265
- return true;
266
- }
267
-
268
- #else // MacOS and POSIX
269
- #include <sys/types.h>
270
- #include <sys/resource.h>
271
-
272
- bool set_process_priority(enum lm_ggml_sched_priority prio) {
273
- if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
274
- return true;
275
- }
276
-
277
- int p = 0;
278
- switch (prio) {
279
- case LM_GGML_SCHED_PRIO_NORMAL: p = 0; break;
280
- case LM_GGML_SCHED_PRIO_MEDIUM: p = -5; break;
281
- case LM_GGML_SCHED_PRIO_HIGH: p = -10; break;
282
- case LM_GGML_SCHED_PRIO_REALTIME: p = -20; break;
283
- }
284
-
285
- if (!setpriority(PRIO_PROCESS, 0, p)) {
286
- LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
287
- return false;
288
- }
289
- return true;
290
- }
291
-
292
- #endif
293
-
294
- //
295
- // CLI argument parsing
296
- //
297
-
298
-
299
- void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
300
- int32_t n_set = 0;
301
-
302
- if (cpuparams.n_threads < 0) {
303
- // Assuming everything about cpuparams is invalid
304
- if (role_model != nullptr) {
305
- cpuparams = *role_model;
306
- } else {
307
- cpuparams.n_threads = cpu_get_num_math();
308
- }
309
- }
310
-
311
- for (int32_t i = 0; i < LM_GGML_MAX_N_THREADS; i++) {
312
- if (cpuparams.cpumask[i]) {
313
- n_set++;
314
- }
315
- }
316
-
317
- if (n_set && n_set < cpuparams.n_threads) {
318
- // Not enough set bits, may experience performance issues.
319
- LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
320
- }
321
- }
322
-
323
- bool parse_cpu_range(const std::string & range, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
324
- size_t dash_loc = range.find('-');
325
- if (dash_loc == std::string::npos) {
326
- LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
327
- return false;
328
- }
329
-
330
- size_t start_i;
331
- size_t end_i;
332
-
333
- if (dash_loc == 0) {
334
- start_i = 0;
335
- } else {
336
- start_i = std::stoull(range.substr(0, dash_loc));
337
- if (start_i >= LM_GGML_MAX_N_THREADS) {
338
- LOG_ERR("Start index out of bounds!\n");
339
- return false;
340
- }
341
- }
342
-
343
- if (dash_loc == range.length() - 1) {
344
- end_i = LM_GGML_MAX_N_THREADS - 1;
345
- } else {
346
- end_i = std::stoull(range.substr(dash_loc + 1));
347
- if (end_i >= LM_GGML_MAX_N_THREADS) {
348
- LOG_ERR("End index out of bounds!\n");
349
- return false;
350
- }
351
- }
352
-
353
- for (size_t i = start_i; i <= end_i; i++) {
354
- boolmask[i] = true;
355
- }
356
-
357
- return true;
358
- }
359
-
360
- bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
361
- // Discard potential 0x prefix
362
- size_t start_i = 0;
363
- if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
364
- start_i = 2;
365
- }
366
-
367
- size_t num_digits = mask.length() - start_i;
368
- if (num_digits > 128) num_digits = 128;
369
-
370
- size_t end_i = num_digits + start_i;
371
-
372
- for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
373
- char c = mask.at(i);
374
- int8_t id = c;
375
-
376
- if ((c >= '0' && c <= '9')) {
377
- id -= '0';
378
- } else if (c >= 'a' && c <= 'f') {
379
- id -= 'a' - 10;
380
- } else if (c >= 'A' && c <= 'F') {
381
- id -= 'A' - 10;
382
- } else {
383
- LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
384
- return false;
385
- }
386
-
387
- boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
388
- boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
389
- boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
390
- boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
391
- }
392
-
393
- return true;
394
- }
395
-
396
- void common_init() {
397
- llama_log_set([](lm_ggml_log_level level, const char * text, void * /*user_data*/) {
398
- if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
399
- common_log_add(common_log_main(), level, "%s", text);
400
- }
401
- }, NULL);
402
-
403
- #ifdef NDEBUG
404
- const char * build_type = "";
405
- #else
406
- const char * build_type = " (debug)";
407
- #endif
408
-
409
- LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
410
- }
411
-
412
- std::string common_params_get_system_info(const common_params & params) {
413
- std::ostringstream os;
414
-
415
- os << "system_info: n_threads = " << params.cpuparams.n_threads;
416
- if (params.cpuparams_batch.n_threads != -1) {
417
- os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
418
- }
419
- #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
420
- // TODO: windows + arm64 + mingw64
421
- DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
422
- os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
423
- #else
424
- os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
425
- #endif
426
-
427
- return os.str();
428
- }
429
-
430
- //
431
- // String utils
432
- //
433
-
434
- std::string string_format(const char * fmt, ...) {
435
- va_list ap;
436
- va_list ap2;
437
- va_start(ap, fmt);
438
- va_copy(ap2, ap);
439
- int size = vsnprintf(NULL, 0, fmt, ap);
440
- LM_GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
441
- std::vector<char> buf(size + 1);
442
- int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
443
- LM_GGML_ASSERT(size2 == size);
444
- va_end(ap2);
445
- va_end(ap);
446
- return std::string(buf.data(), size);
447
- }
448
-
449
- std::string string_strip(const std::string & str) {
450
- size_t start = 0;
451
- size_t end = str.size();
452
- while (start < end && std::isspace(str[start])) {
453
- start++;
454
- }
455
- while (end > start && std::isspace(str[end - 1])) {
456
- end--;
457
- }
458
- return str.substr(start, end - start);
459
- }
460
-
461
- std::string string_get_sortable_timestamp() {
462
- using clock = std::chrono::system_clock;
463
-
464
- const clock::time_point current_time = clock::now();
465
- const time_t as_time_t = clock::to_time_t(current_time);
466
- char timestamp_no_ns[100];
467
- std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
468
-
469
- const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
470
- current_time.time_since_epoch() % 1000000000).count();
471
- char timestamp_ns[11];
472
- snprintf(timestamp_ns, 11, "%09" PRId64, ns);
473
-
474
- return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
475
- }
476
-
477
- void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
478
- if (search.empty()) {
479
- return;
480
- }
481
- std::string builder;
482
- builder.reserve(s.length());
483
- size_t pos = 0;
484
- size_t last_pos = 0;
485
- while ((pos = s.find(search, last_pos)) != std::string::npos) {
486
- builder.append(s, last_pos, pos - last_pos);
487
- builder.append(replace);
488
- last_pos = pos + search.length();
489
- }
490
- builder.append(s, last_pos, std::string::npos);
491
- s = std::move(builder);
492
- }
493
-
494
- std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
495
- std::ostringstream result;
496
- for (size_t i = 0; i < values.size(); ++i) {
497
- if (i > 0) {
498
- result << separator;
499
- }
500
- result << values[i];
501
- }
502
- return result.str();
503
- }
504
-
505
- std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
506
- std::vector<std::string> parts;
507
- size_t start = 0;
508
- size_t end = str.find(delimiter);
509
-
510
- while (end != std::string::npos) {
511
- parts.push_back(str.substr(start, end - start));
512
- start = end + delimiter.length();
513
- end = str.find(delimiter, start);
514
- }
515
-
516
- parts.push_back(str.substr(start));
517
-
518
- return parts;
519
- }
520
-
521
- std::string string_repeat(const std::string & str, size_t n) {
522
- if (n == 0) {
523
- return "";
524
- }
525
-
526
- std::string result;
527
- result.reserve(str.length() * n);
528
-
529
- for (size_t i = 0; i < n; ++i) {
530
- result += str;
531
- }
532
-
533
- return result;
534
- }
535
-
536
- std::string string_from(bool value) {
537
- return value ? "true" : "false";
538
- }
539
-
540
// Formats a list of integers as "[ a, b, c ]" (an empty list becomes "[  ]").
std::string string_from(const std::vector<int> & values) {
    std::stringstream out;

    out << "[ ";
    for (size_t idx = 0; idx < values.size(); ++idx) {
        if (idx != 0) {
            out << ", ";
        }
        out << std::to_string(values[idx]);
    }
    out << " ]";

    return out.str();
}
557
-
558
- std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
559
- std::stringstream buf;
560
-
561
- buf << "[ ";
562
-
563
- bool first = true;
564
- for (const auto & token : tokens) {
565
- if (!first) {
566
- buf << ", ";
567
- } else {
568
- first = false;
569
- }
570
-
571
- auto detokenized = common_token_to_piece(ctx, token);
572
-
573
- detokenized.erase(
574
- std::remove_if(
575
- detokenized.begin(),
576
- detokenized.end(),
577
- [](const unsigned char c) { return !std::isprint(c); }),
578
- detokenized.end());
579
-
580
- buf << "'" << detokenized << "'"
581
- << ":" << std::to_string(token);
582
- }
583
-
584
- buf << " ]";
585
-
586
- return buf.str();
587
- }
588
-
589
- std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
590
- std::stringstream buf;
591
-
592
- buf << "[ ";
593
-
594
- bool first = true;
595
- for (int i = 0; i < batch.n_tokens; ++i) {
596
- if (!first) {
597
- buf << ", ";
598
- } else {
599
- first = false;
600
- }
601
-
602
- auto detokenized = common_token_to_piece(ctx, batch.token[i]);
603
-
604
- detokenized.erase(
605
- std::remove_if(
606
- detokenized.begin(),
607
- detokenized.end(),
608
- [](const unsigned char c) { return !std::isprint(c); }),
609
- detokenized.end());
610
-
611
- buf << "\n" << std::to_string(i)
612
- << ", token '" << detokenized << "'"
613
- << ", pos " << std::to_string(batch.pos[i])
614
- << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
615
- << ", seq_id " << std::to_string(batch.seq_id[i][0])
616
- << ", logits " << std::to_string(batch.logits[i]);
617
- }
618
-
619
- buf << " ]";
620
-
621
- return buf.str();
622
- }
623
-
624
// Decodes backslash escape sequences in `input` in place and shrinks the string accordingly.
// Recognised: \n \r \t \' \" \\ and \xHH (exactly two hexadecimal digits).
// Anything else - including a malformed \x sequence or a trailing lone backslash - is kept verbatim.
void string_process_escapes(std::string & input) {
    const std::size_t input_len = input.length();
    std::size_t output_idx = 0; // write position; never overtakes the read position

    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
            switch (input[++input_idx]) {
                case 'n':  input[output_idx++] = '\n'; break;
                case 'r':  input[output_idx++] = '\r'; break;
                case 't':  input[output_idx++] = '\t'; break;
                case '\'': input[output_idx++] = '\''; break;
                case '\"': input[output_idx++] = '\"'; break;
                case '\\': input[output_idx++] = '\\'; break;
                case 'x':
                    // Handle \x12, etc
                    if (input_idx + 2 < input_len) {
                        const unsigned char hi = static_cast<unsigned char>(input[input_idx + 1]);
                        const unsigned char lo = static_cast<unsigned char>(input[input_idx + 2]);
                        // Require two real hex digits: strtol alone would also accept leading
                        // whitespace or a sign (e.g. "\x+1", "\x 1") and decode them as escapes.
                        if (std::isxdigit(hi) && std::isxdigit(lo)) {
                            const char x[3] = { static_cast<char>(hi), static_cast<char>(lo), 0 };
                            const long val = std::strtol(x, nullptr, 16);
                            input_idx += 2;
                            input[output_idx++] = static_cast<char>(val);
                            break;
                        }
                    }
                    // fall through
                default:
                    // unknown escape: keep the backslash and the character that followed it
                    input[output_idx++] = '\\';
                    input[output_idx++] = input[input_idx];
                    break;
            }
        } else {
            input[output_idx++] = input[input_idx];
        }
    }

    input.resize(output_idx);
}
660
-
661
- bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
662
- const char * sep = strchr(data, '=');
663
- if (sep == nullptr || sep - data >= 128) {
664
- LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
665
- return false;
666
- }
667
- llama_model_kv_override kvo;
668
- std::strncpy(kvo.key, data, sep - data);
669
- kvo.key[sep - data] = 0;
670
- sep++;
671
- if (strncmp(sep, "int:", 4) == 0) {
672
- sep += 4;
673
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
674
- kvo.val_i64 = std::atol(sep);
675
- } else if (strncmp(sep, "float:", 6) == 0) {
676
- sep += 6;
677
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
678
- kvo.val_f64 = std::atof(sep);
679
- } else if (strncmp(sep, "bool:", 5) == 0) {
680
- sep += 5;
681
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
682
- if (std::strcmp(sep, "true") == 0) {
683
- kvo.val_bool = true;
684
- } else if (std::strcmp(sep, "false") == 0) {
685
- kvo.val_bool = false;
686
- } else {
687
- LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
688
- return false;
689
- }
690
- } else if (strncmp(sep, "str:", 4) == 0) {
691
- sep += 4;
692
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
693
- if (strlen(sep) > 127) {
694
- LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
695
- return false;
696
- }
697
- strncpy(kvo.val_str, sep, 127);
698
- kvo.val_str[127] = '\0';
699
- } else {
700
- LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
701
- return false;
702
- }
703
- overrides.emplace_back(std::move(kvo));
704
- return true;
705
- }
706
-
707
- //
708
- // Filesystem utils
709
- //
710
-
711
// Checks whether `filename` is safe to use as a single path component.
// To validate a full path, split it by the OS-specific path separator and validate each part with this function.
bool fs_validate_filename(const std::string & filename) {
    // Empty names are invalid. Names longer than 255 bytes exceed the common largest filename
    // on Linux filesystems, so reject them early (smaller OS limits are enforced by the OS itself).
    if (filename.empty() || filename.length() > 255) {
        return false;
    }

    std::u32string codepoints;
    try {
#if defined(__clang__)
        // disable C++17 deprecation warning for std::codecvt_utf8
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> utf8_codec;

#if defined(__clang__)
#    pragma clang diagnostic pop
#endif

        codepoints = utf8_codec.from_bytes(filename);

        // A round trip that does not reproduce the input means overlong UTF-8 sequences
        // or invalid encodings were used. Reject such attempts.
        if (utf8_codec.to_bytes(codepoints) != filename) {
            return false;
        }
    } catch (const std::exception &) {
        return false;
    }

    // Forbidden codepoints: control characters, Unicode look-alikes of illegal characters,
    // UTF-16 surrogates, the replacement character, the BOM and / \ : * ? " < > |
    const auto is_forbidden = [](const char32_t c) {
        const bool control   = c <= 0x1F                       // C0
                            || c == 0x7F                       // DEL
                            || (c >= 0x80 && c <= 0x9F);       // C1
        const bool lookalike = c == 0xFF0E                     // Fullwidth Full Stop (period equivalent)
                            || c == 0x2215                     // Division Slash (forward slash equivalent)
                            || c == 0x2216;                    // Set Minus (backslash equivalent)
        const bool special   = (c >= 0xD800 && c <= 0xDFFF)    // UTF-16 surrogate pairs
                            || c == 0xFFFD                     // Replacement Character
                            || c == 0xFEFF;                    // Byte Order Mark (BOM)
        const bool illegal   = c == '/' || c == '\\' || c == ':' || c == '*'
                            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|';
        return control || lookalike || special || illegal;
    };

    for (const char32_t c : codepoints) {
        if (is_forbidden(c)) {
            return false;
        }
    }

    // Leading/trailing 0x20 spaces and a trailing '.' are stripped on Windows and would yield a different
    // filename. Unicode and other whitespace is not affected.
    const char first = filename.front();
    const char last  = filename.back();
    if (first == ' ' || last == ' ' || last == '.') {
        return false;
    }

    // Reject any ".." (currently stricter than necessary, checking for == ".." would be enough)
    if (filename.find("..") != std::string::npos) {
        return false;
    }

    // Reject "."
    if (filename == ".") {
        return false;
    }

    return true;
}
791
-
792
// Creates the directories named by `path`, walking the path front to back and creating every prefix
// that ends right before a path separator ('\\' on Windows, '/' elsewhere). The last component is
// therefore only created when `path` ends with a separator.
// returns true if successful (or if `path` already is a directory), false otherwise
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    std::wstring wpath = converter.from_bytes(path);

    // if the path already exists, check whether it's a directory
    const DWORD attributes = GetFileAttributesW(wpath.c_str());
    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
        return true;
    }

    size_t pos_slash = 0;

    // process path from front to back, procedurally creating directories
    // The separator is searched in the wide string: offsets into the UTF-8 `path` do not line up
    // with offsets into `wpath` as soon as the path contains non-ASCII characters.
    while ((pos_slash = wpath.find(L'\\', pos_slash)) != std::wstring::npos) {
        const std::wstring subpath = wpath.substr(0, pos_slash);

        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
        if (!success) {
            const DWORD error = GetLastError();

            // if the path already exists, ensure that it's a directory
            if (error == ERROR_ALREADY_EXISTS) {
                const DWORD sub_attributes = GetFileAttributesW(subpath.c_str());
                if (sub_attributes == INVALID_FILE_ATTRIBUTES || !(sub_attributes & FILE_ATTRIBUTE_DIRECTORY)) {
                    return false;
                }
            } else {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#else
    // if the path already exists, check whether it's a directory
    struct stat info;
    if (stat(path.c_str(), &info) == 0) {
        return S_ISDIR(info.st_mode);
    }

    size_t pos_slash = 1; // skip leading slashes for directory creation

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
        const std::string subpath = path.substr(0, pos_slash);
        struct stat sub_info;

        // if the path already exists, ensure that it's a directory
        if (stat(subpath.c_str(), &sub_info) == 0) {
            if (!S_ISDIR(sub_info.st_mode)) {
                return false;
            }
        } else {
            // create parent directories
            const int ret = mkdir(subpath.c_str(), 0755);
            if (ret != 0) {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#endif // _WIN32
}
863
-
864
- std::string fs_get_cache_directory() {
865
- std::string cache_directory = "";
866
- auto ensure_trailing_slash = [](std::string p) {
867
- // Make sure to add trailing slash
868
- if (p.back() != DIRECTORY_SEPARATOR) {
869
- p += DIRECTORY_SEPARATOR;
870
- }
871
- return p;
872
- };
873
- if (getenv("LLAMA_CACHE")) {
874
- cache_directory = std::getenv("LLAMA_CACHE");
875
- } else {
876
- #ifdef __linux__
877
- if (std::getenv("XDG_CACHE_HOME")) {
878
- cache_directory = std::getenv("XDG_CACHE_HOME");
879
- } else {
880
- cache_directory = std::getenv("HOME") + std::string("/.cache/");
881
- }
882
- #elif defined(__APPLE__)
883
- cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
884
- #elif defined(_WIN32)
885
- cache_directory = std::getenv("LOCALAPPDATA");
886
- #endif // __linux__
887
- cache_directory = ensure_trailing_slash(cache_directory);
888
- cache_directory += "llama.cpp";
889
- }
890
- return ensure_trailing_slash(cache_directory);
891
- }
892
-
893
- std::string fs_get_cache_file(const std::string & filename) {
894
- LM_GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
895
- std::string cache_directory = fs_get_cache_directory();
896
- const bool success = fs_create_directory_with_parents(cache_directory);
897
- if (!success) {
898
- throw std::runtime_error("failed to create cache directory: " + cache_directory);
899
- }
900
- return cache_directory + filename;
901
- }
902
-
903
-
904
- //
905
- // Model utils
906
- //
907
- struct common_init_result common_init_from_params(common_params & params) {
908
- common_init_result iparams;
909
- auto mparams = common_model_params_to_llama(params);
910
-
911
- llama_model * model = nullptr;
912
-
913
- if (!params.hf_repo.empty() && !params.hf_file.empty()) {
914
- model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
915
- } else if (!params.model_url.empty()) {
916
- model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
917
- } else {
918
- model = llama_model_load_from_file(params.model.c_str(), mparams);
919
- }
920
-
921
- if (model == NULL) {
922
- LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
923
- return iparams;
924
- }
925
-
926
- const llama_vocab * vocab = llama_model_get_vocab(model);
927
-
928
- if (params.reranking) {
929
- bool ok = true;
930
-
931
- if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
932
- LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
933
- ok = false;
934
- }
935
-
936
- if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
937
- LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
938
- ok = false;
939
- }
940
-
941
- if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
942
- LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
943
- ok = false;
944
- }
945
-
946
- if (!ok) {
947
- llama_model_free(model);
948
-
949
- return iparams;
950
- }
951
- }
952
-
953
- auto cparams = common_context_params_to_llama(params);
954
-
955
- llama_context * lctx = llama_init_from_model(model, cparams);
956
- if (lctx == NULL) {
957
- LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
958
- llama_model_free(model);
959
- return iparams;
960
- }
961
-
962
- if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
963
- LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
964
- params.ctx_shift = false;
965
- }
966
-
967
- if (!params.control_vectors.empty()) {
968
- if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
969
- if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
970
-
971
- const auto cvec = common_control_vector_load(params.control_vectors);
972
- if (cvec.n_embd == -1) {
973
- llama_free(lctx);
974
- llama_model_free(model);
975
-
976
- return iparams;
977
- }
978
-
979
- int err = llama_apply_adapter_cvec(
980
- lctx,
981
- cvec.data.data(),
982
- cvec.data.size(),
983
- cvec.n_embd,
984
- params.control_vector_layer_start,
985
- params.control_vector_layer_end);
986
- if (err) {
987
- llama_free(lctx);
988
- llama_model_free(model);
989
-
990
- return iparams;
991
- }
992
- }
993
-
994
- // load and optionally apply lora adapters
995
- for (auto & la : params.lora_adapters) {
996
- llama_adapter_lora_ptr lora;
997
- lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
998
- if (lora == nullptr) {
999
- LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
1000
- llama_free(lctx);
1001
- llama_model_free(model);
1002
- return iparams;
1003
- }
1004
-
1005
- la.ptr = lora.get();
1006
- iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
1007
- }
1008
-
1009
- if (!params.lora_init_without_apply) {
1010
- common_set_adapter_lora(lctx, params.lora_adapters);
1011
- }
1012
-
1013
- if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
1014
- LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
1015
- params.sampling.ignore_eos = false;
1016
- }
1017
-
1018
- if (params.sampling.ignore_eos) {
1019
- for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
1020
- if (llama_vocab_is_eog(vocab, i)) {
1021
- LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
1022
- params.sampling.logit_bias.push_back({i, -INFINITY});
1023
- }
1024
- }
1025
- }
1026
-
1027
- if (params.sampling.penalty_last_n == -1) {
1028
- LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
1029
- params.sampling.penalty_last_n = llama_n_ctx(lctx);
1030
- }
1031
-
1032
- if (params.sampling.dry_penalty_last_n == -1) {
1033
- LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
1034
- params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
1035
- }
1036
-
1037
- if (params.warmup) {
1038
- LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
1039
-
1040
- std::vector<llama_token> tmp;
1041
- llama_token bos = llama_vocab_bos(vocab);
1042
- llama_token eos = llama_vocab_eos(vocab);
1043
-
1044
- // some models (e.g. T5) don't have a BOS token
1045
- if (bos != LLAMA_TOKEN_NULL) {
1046
- tmp.push_back(bos);
1047
- }
1048
- if (eos != LLAMA_TOKEN_NULL) {
1049
- tmp.push_back(eos);
1050
- }
1051
- if (tmp.empty()) {
1052
- tmp.push_back(0);
1053
- }
1054
-
1055
- if (llama_model_has_encoder(model)) {
1056
- llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
1057
- llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
1058
- if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
1059
- decoder_start_token_id = bos;
1060
- }
1061
- tmp.clear();
1062
- tmp.push_back(decoder_start_token_id);
1063
- }
1064
- if (llama_model_has_decoder(model)) {
1065
- llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
1066
- }
1067
- llama_kv_cache_clear(lctx);
1068
- llama_synchronize(lctx);
1069
- llama_perf_context_reset(lctx);
1070
- }
1071
-
1072
- iparams.model.reset(model);
1073
- iparams.context.reset(lctx);
1074
-
1075
- return iparams;
1076
- }
1077
-
1078
- void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
1079
- llama_clear_adapter_lora(ctx);
1080
- for (auto & la : lora) {
1081
- if (la.scale != 0.0f) {
1082
- llama_set_adapter_lora(ctx, la.ptr, la.scale);
1083
- }
1084
- }
1085
- }
1086
-
1087
- struct llama_model_params common_model_params_to_llama(common_params & params) {
1088
- auto mparams = llama_model_default_params();
1089
-
1090
- if (!params.devices.empty()) {
1091
- mparams.devices = params.devices.data();
1092
- }
1093
- if (params.n_gpu_layers != -1) {
1094
- mparams.n_gpu_layers = params.n_gpu_layers;
1095
- }
1096
-
1097
- mparams.progress_callback_user_data = params.progress_callback_user_data;
1098
- mparams.progress_callback = params.progress_callback;
1099
- mparams.vocab_only = params.vocab_only;
1100
- mparams.main_gpu = params.main_gpu;
1101
- mparams.split_mode = params.split_mode;
1102
- mparams.tensor_split = params.tensor_split;
1103
- mparams.use_mmap = params.use_mmap;
1104
- mparams.use_mlock = params.use_mlock;
1105
- mparams.check_tensors = params.check_tensors;
1106
- if (params.kv_overrides.empty()) {
1107
- mparams.kv_overrides = NULL;
1108
- } else {
1109
- LM_GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
1110
- mparams.kv_overrides = params.kv_overrides.data();
1111
- }
1112
-
1113
- return mparams;
1114
- }
1115
-
1116
- struct llama_context_params common_context_params_to_llama(const common_params & params) {
1117
- auto cparams = llama_context_default_params();
1118
-
1119
- cparams.n_ctx = params.n_ctx;
1120
- cparams.n_seq_max = params.n_parallel;
1121
- cparams.n_batch = params.n_batch;
1122
- cparams.n_ubatch = params.n_ubatch;
1123
- cparams.n_threads = params.cpuparams.n_threads;
1124
- cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
1125
- params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
1126
- cparams.logits_all = params.logits_all;
1127
- cparams.embeddings = params.embedding;
1128
- cparams.rope_scaling_type = params.rope_scaling_type;
1129
- cparams.rope_freq_base = params.rope_freq_base;
1130
- cparams.rope_freq_scale = params.rope_freq_scale;
1131
- cparams.yarn_ext_factor = params.yarn_ext_factor;
1132
- cparams.yarn_attn_factor = params.yarn_attn_factor;
1133
- cparams.yarn_beta_fast = params.yarn_beta_fast;
1134
- cparams.yarn_beta_slow = params.yarn_beta_slow;
1135
- cparams.yarn_orig_ctx = params.yarn_orig_ctx;
1136
- cparams.pooling_type = params.pooling_type;
1137
- cparams.attention_type = params.attention_type;
1138
- cparams.defrag_thold = params.defrag_thold;
1139
- cparams.cb_eval = params.cb_eval;
1140
- cparams.cb_eval_user_data = params.cb_eval_user_data;
1141
- cparams.offload_kqv = !params.no_kv_offload;
1142
- cparams.flash_attn = params.flash_attn;
1143
- cparams.no_perf = params.no_perf;
1144
-
1145
- if (params.reranking) {
1146
- cparams.embeddings = true;
1147
- cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
1148
- }
1149
-
1150
- cparams.type_k = params.cache_type_k;
1151
- cparams.type_v = params.cache_type_v;
1152
-
1153
- return cparams;
1154
- }
1155
-
1156
- struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
1157
- struct lm_ggml_threadpool_params tpp;
1158
-
1159
- lm_ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
1160
-
1161
- if (params.mask_valid) {
1162
- std::memcpy(&tpp.cpumask, &params.cpumask, LM_GGML_MAX_N_THREADS);
1163
- }
1164
-
1165
- tpp.prio = params.priority;
1166
- tpp.poll = params.poll;
1167
- tpp.strict_cpu = params.strict_cpu;
1168
-
1169
- return tpp;
1170
- }
1171
-
1172
- #ifdef LLAMA_USE_CURL
1173
-
1174
- #define CURL_MAX_RETRY 3
1175
- #define CURL_RETRY_DELAY_SECONDS 2
1176
-
1177
- static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
1178
- int remaining_attempts = max_attempts;
1179
-
1180
- while (remaining_attempts > 0) {
1181
- LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
1182
-
1183
- CURLcode res = curl_easy_perform(curl);
1184
- if (res == CURLE_OK) {
1185
- return true;
1186
- }
1187
-
1188
- int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
1189
- LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
1190
-
1191
- remaining_attempts--;
1192
- std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
1193
- }
1194
-
1195
- LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
1196
-
1197
- return false;
1198
- }
1199
-
1200
-
1201
- struct llama_model * common_load_model_from_url(
1202
- const std::string & model_url,
1203
- const std::string & local_path,
1204
- const std::string & hf_token,
1205
- const struct llama_model_params & params) {
1206
- // Basic validation of the model_url
1207
- if (model_url.empty()) {
1208
- LOG_ERR("%s: invalid model_url\n", __func__);
1209
- return NULL;
1210
- }
1211
-
1212
- if (!common_download_file(model_url, local_path, hf_token)) {
1213
- return NULL;
1214
- }
1215
-
1216
- // check for additional GGUFs split to download
1217
- int n_split = 0;
1218
- {
1219
- struct lm_gguf_init_params lm_gguf_params = {
1220
- /*.no_alloc = */ true,
1221
- /*.ctx = */ NULL,
1222
- };
1223
- auto * ctx_gguf = lm_gguf_init_from_file(local_path.c_str(), lm_gguf_params);
1224
- if (!ctx_gguf) {
1225
- LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
1226
- return NULL;
1227
- }
1228
-
1229
- auto key_n_split = lm_gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
1230
- if (key_n_split >= 0) {
1231
- n_split = lm_gguf_get_val_u16(ctx_gguf, key_n_split);
1232
- }
1233
-
1234
- lm_gguf_free(ctx_gguf);
1235
- }
1236
-
1237
- if (n_split > 1) {
1238
- char split_prefix[PATH_MAX] = {0};
1239
- char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
1240
-
1241
- // Verify the first split file format
1242
- // and extract split URL and PATH prefixes
1243
- {
1244
- if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
1245
- LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
1246
- return NULL;
1247
- }
1248
-
1249
- if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
1250
- LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
1251
- return NULL;
1252
- }
1253
- }
1254
-
1255
- // Prepare download in parallel
1256
- std::vector<std::future<bool>> futures_download;
1257
- for (int idx = 1; idx < n_split; idx++) {
1258
- futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
1259
- char split_path[PATH_MAX] = {0};
1260
- llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
1261
-
1262
- char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
1263
- llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
1264
-
1265
- return common_download_file(split_url, split_path, hf_token);
1266
- }, idx));
1267
- }
1268
-
1269
- // Wait for all downloads to complete
1270
- for (auto & f : futures_download) {
1271
- if (!f.get()) {
1272
- return NULL;
1273
- }
1274
- }
1275
- }
1276
-
1277
- return llama_model_load_from_file(local_path.c_str(), params);
1278
- }
1279
-
1280
- struct llama_model * common_load_model_from_hf(
1281
- const std::string & repo,
1282
- const std::string & remote_path,
1283
- const std::string & local_path,
1284
- const std::string & hf_token,
1285
- const struct llama_model_params & params) {
1286
- // construct hugging face model url:
1287
- //
1288
- // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
1289
- // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
1290
- //
1291
- // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
1292
- // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
1293
- //
1294
-
1295
- std::string model_url = "https://huggingface.co/";
1296
- model_url += repo;
1297
- model_url += "/resolve/main/";
1298
- model_url += remote_path;
1299
-
1300
- return common_load_model_from_url(model_url, local_path, hf_token, params);
1301
- }
1302
-
1303
- /**
1304
- * Allow getting the HF file from the HF repo with tag (like ollama), for example:
1305
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
1306
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
1307
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
1308
- * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
1309
- *
1310
- * Return pair of <repo, file> (with "repo" already having tag removed)
1311
- *
1312
- * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
1313
- */
1314
- std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
1315
- auto parts = string_split<std::string>(hf_repo_with_tag, ':');
1316
- std::string tag = parts.size() > 1 ? parts.back() : "latest";
1317
- std::string hf_repo = parts[0];
1318
- if (string_split<std::string>(hf_repo, '/').size() != 2) {
1319
- throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
1320
- }
1321
-
1322
- // fetch model info from Hugging Face Hub API
1323
- json model_info;
1324
- curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
1325
- curl_slist_ptr http_headers;
1326
- std::string res_str;
1327
- std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
1328
- curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
1329
- curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
1330
- typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
1331
- auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
1332
- static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
1333
- return size * nmemb;
1334
- };
1335
- curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
1336
- curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
1337
- #if defined(_WIN32)
1338
- curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
1339
- #endif
1340
- if (!hf_token.empty()) {
1341
- std::string auth_header = "Authorization: Bearer " + hf_token;
1342
- http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
1343
- }
1344
- // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
1345
- http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
1346
- http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
1347
- curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
1348
-
1349
- CURLcode res = curl_easy_perform(curl.get());
1350
-
1351
- if (res != CURLE_OK) {
1352
- throw std::runtime_error("error: cannot make GET request to HF API");
1353
- }
1354
-
1355
- long res_code;
1356
- curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
1357
- if (res_code == 200) {
1358
- model_info = json::parse(res_str);
1359
- } else if (res_code == 401) {
1360
- throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
1361
- } else {
1362
- throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
1363
- }
1364
-
1365
- // check response
1366
- if (!model_info.contains("ggufFile")) {
1367
- throw std::runtime_error("error: model does not have ggufFile");
1368
- }
1369
- json & lm_gguf_file = model_info.at("ggufFile");
1370
- if (!lm_gguf_file.contains("rfilename")) {
1371
- throw std::runtime_error("error: ggufFile does not have rfilename");
1372
- }
1373
-
1374
- return std::make_pair(hf_repo, lm_gguf_file.at("rfilename"));
1375
- }
1376
-
1377
- #else
1378
-
1379
- struct llama_model * common_load_model_from_url(
1380
- const std::string & /*model_url*/,
1381
- const std::string & /*local_path*/,
1382
- const std::string & /*hf_token*/,
1383
- const struct llama_model_params & /*params*/) {
1384
- LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
1385
- return nullptr;
1386
- }
1387
-
1388
- struct llama_model * common_load_model_from_hf(
1389
- const std::string & /*repo*/,
1390
- const std::string & /*remote_path*/,
1391
- const std::string & /*local_path*/,
1392
- const std::string & /*hf_token*/,
1393
- const struct llama_model_params & /*params*/) {
1394
- LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
1395
- return nullptr;
1396
- }
1397
-
1398
- std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
1399
- LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
1400
- return std::make_pair("", "");
1401
- }
1402
-
1403
- #endif // LLAMA_USE_CURL
1404
-
1405
- //
1406
- // Batch utils
1407
- //
1408
-
1409
- void common_batch_clear(struct llama_batch & batch) {
1410
- batch.n_tokens = 0;
1411
- }
1412
-
1413
- void common_batch_add(
1414
- struct llama_batch & batch,
1415
- llama_token id,
1416
- llama_pos pos,
1417
- const std::vector<llama_seq_id> & seq_ids,
1418
- bool logits) {
1419
- LM_GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
1420
-
1421
- batch.token [batch.n_tokens] = id;
1422
- batch.pos [batch.n_tokens] = pos;
1423
- batch.n_seq_id[batch.n_tokens] = seq_ids.size();
1424
- for (size_t i = 0; i < seq_ids.size(); ++i) {
1425
- batch.seq_id[batch.n_tokens][i] = seq_ids[i];
1426
- }
1427
- batch.logits [batch.n_tokens] = logits;
1428
-
1429
- batch.n_tokens++;
1430
- }
1431
-
1432
- //
1433
- // Token utils
1434
- //
1435
-
1436
- size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
1437
- size_t i;
1438
- for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
1439
-
1440
- return i;
1441
- }
1442
-
1443
- size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
1444
- // check for empty sequences
1445
- if (a.empty() || b.empty()) {
1446
- return 0;
1447
- }
1448
-
1449
- // get the lengths of the input sequences
1450
- size_t a_len = a.size();
1451
- size_t b_len = b.size();
1452
-
1453
- // initialize the maximum length of the longest common subsequence (LCS)
1454
- size_t max_length = 0;
1455
-
1456
- // use two rows instead of a 2D matrix to optimize space
1457
- std::vector<size_t> prev_row(b_len + 1, 0);
1458
- std::vector<size_t> curr_row(b_len + 1, 0);
1459
-
1460
- // iterate through the elements of a
1461
- for (size_t i = 1; i <= a_len; i++) {
1462
- // iterate through the elements of b
1463
- for (size_t j = 1; j <= b_len; j++) {
1464
- // if elements at the current positions match
1465
- if (a[i - 1] == b[j - 1]) {
1466
- // if it's the first element of either sequences, set LCS length to 1
1467
- if (i == 1 || j == 1) {
1468
- curr_row[j] = 1;
1469
- } else {
1470
- // increment LCS length by 1 compared to the previous element
1471
- curr_row[j] = prev_row[j - 1] + 1;
1472
- }
1473
-
1474
- // update max_length if necessary
1475
- if (curr_row[j] > max_length) {
1476
- max_length = curr_row[j];
1477
- }
1478
- } else {
1479
- // reset LCS length if elements don't match
1480
- curr_row[j] = 0;
1481
- }
1482
- }
1483
-
1484
- // update the previous row for the next iteration
1485
- prev_row = curr_row;
1486
- }
1487
-
1488
- // return the maximum length of the LCS
1489
- return max_length;
1490
- }
1491
-
1492
- //
1493
- // Vocab utils
1494
- //
1495
-
1496
- std::vector<llama_token> common_tokenize(
1497
- const struct llama_context * ctx,
1498
- const std::string & text,
1499
- bool add_special,
1500
- bool parse_special) {
1501
- const llama_model * model = llama_get_model(ctx);
1502
- const llama_vocab * vocab = llama_model_get_vocab(model);
1503
- return common_tokenize(vocab, text, add_special, parse_special);
1504
- }
1505
-
1506
- std::vector<llama_token> common_tokenize(
1507
- const struct llama_vocab * vocab,
1508
- const std::string & text,
1509
- bool add_special,
1510
- bool parse_special) {
1511
- // upper limit for the number of tokens
1512
- int n_tokens = text.length() + 2 * add_special;
1513
- std::vector<llama_token> result(n_tokens);
1514
- n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
1515
- if (n_tokens < 0) {
1516
- result.resize(-n_tokens);
1517
- int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
1518
- LM_GGML_ASSERT(check == -n_tokens);
1519
- } else {
1520
- result.resize(n_tokens);
1521
- }
1522
- return result;
1523
- }
1524
-
1525
- std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
1526
- const llama_model * model = llama_get_model(ctx);
1527
- const llama_vocab * vocab = llama_model_get_vocab(model);
1528
- return common_token_to_piece(vocab, token, special);
1529
- }
1530
-
1531
- std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
1532
- std::string piece;
1533
- piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
1534
- const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
1535
- if (n_chars < 0) {
1536
- piece.resize(-n_chars);
1537
- int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
1538
- LM_GGML_ASSERT(check == -n_chars);
1539
- }
1540
- else {
1541
- piece.resize(n_chars);
1542
- }
1543
-
1544
- return piece;
1545
- }
1546
-
1547
- std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
1548
- const llama_model * model = llama_get_model(ctx);
1549
- const llama_vocab * vocab = llama_model_get_vocab(model);
1550
- return common_detokenize(vocab, tokens, special);
1551
- }
1552
-
1553
- std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
1554
- std::string text;
1555
- text.resize(std::max(text.capacity(), tokens.size()));
1556
- int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
1557
- if (n_chars < 0) {
1558
- text.resize(-n_chars);
1559
- n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
1560
- LM_GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
1561
- }
1562
-
1563
- text.resize(n_chars);
1564
-
1565
- // NOTE: the original tokenizer decodes bytes after collecting the pieces.
1566
- return text;
1567
- }
1568
-
1569
- //
1570
- // Chat template utils
1571
- //
1572
-
1573
- bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
1574
- if (use_jinja) {
1575
- try {
1576
- auto chat_template = common_chat_template(tmpl, "<s>", "</s>");
1577
- common_chat_inputs inputs;
1578
- inputs.messages = json::array({{
1579
- {"role", "user"},
1580
- {"content", "test"},
1581
- }});
1582
- common_chat_params_init(chat_template, inputs);
1583
- return true;
1584
- } catch (const std::exception & e) {
1585
- LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
1586
- return false;
1587
- }
1588
- }
1589
- llama_chat_message chat[] = {{"user", "test"}};
1590
- const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
1591
- return res >= 0;
1592
- }
1593
-
1594
- std::string common_chat_apply_template(
1595
- const common_chat_template & tmpl,
1596
- const std::vector<common_chat_msg> & msgs,
1597
- bool add_ass,
1598
- bool use_jinja) {
1599
- if (use_jinja) {
1600
- auto messages = json::array();
1601
- for (const auto & msg : msgs) {
1602
- messages.push_back({{"role", msg.role}, {"content", msg.content}});
1603
- }
1604
- common_chat_inputs inputs;
1605
- inputs.messages = messages;
1606
- inputs.add_generation_prompt = add_ass;
1607
- return common_chat_params_init(tmpl, inputs).prompt;
1608
- }
1609
-
1610
- int alloc_size = 0;
1611
- std::vector<llama_chat_message> chat;
1612
- for (const auto & msg : msgs) {
1613
- chat.push_back({msg.role.c_str(), msg.content.c_str()});
1614
- alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
1615
- }
1616
-
1617
- std::vector<char> buf(alloc_size);
1618
-
1619
- // run the first time to get the total output length
1620
- int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
1621
-
1622
- // error: chat template is not supported
1623
- if (res < 0) {
1624
- // if the custom "tmpl" is not supported, we throw an error
1625
- // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
1626
- throw std::runtime_error("this custom template is not supported");
1627
- }
1628
-
1629
- // if it turns out that our buffer is too small, we resize it
1630
- if ((size_t) res > buf.size()) {
1631
- buf.resize(res);
1632
- res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
1633
- }
1634
-
1635
- std::string formatted_chat(buf.data(), res);
1636
- return formatted_chat;
1637
- }
1638
-
1639
- std::string common_chat_format_single(
1640
- const common_chat_template & tmpl,
1641
- const std::vector<common_chat_msg> & past_msg,
1642
- const common_chat_msg & new_msg,
1643
- bool add_ass,
1644
- bool use_jinja) {
1645
- std::ostringstream ss;
1646
- auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja);
1647
- std::vector<common_chat_msg> chat_new(past_msg);
1648
- // if the past_msg ends with a newline, we must preserve it in the formatted version
1649
- if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
1650
- ss << "\n";
1651
- };
1652
- // format chat with new_msg
1653
- chat_new.push_back(new_msg);
1654
- auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja);
1655
- // get the diff part
1656
- ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
1657
- return ss.str();
1658
- }
1659
-
1660
- std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) {
1661
- std::vector<common_chat_msg> msgs = {
1662
- {"system", "You are a helpful assistant", {}},
1663
- {"user", "Hello", {}},
1664
- {"assistant", "Hi there", {}},
1665
- {"user", "How are you?", {}},
1666
- };
1667
- return common_chat_apply_template(tmpl, msgs, true, use_jinja);
1668
- }
1669
-
1670
// Jinja source of the ChatML template: one <|im_start|>role ... <|im_end|>
// section per message, followed by an opening assistant section when
// add_generation_prompt is set.
#define CHATML_TEMPLATE_SRC \
"{%- for message in messages -%}\n" \
" {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
"{%- endfor -%}\n" \
"{%- if add_generation_prompt -%}\n" \
" {{- '<|im_start|>assistant\n' -}}\n" \
"{%- endif -%}"
1677
-
1678
- common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
1679
- {
1680
- std::string default_template_src;
1681
- std::string template_tool_use_src;
1682
-
1683
- bool has_explicit_template = !chat_template_override.empty();
1684
- if (chat_template_override.empty()) {
1685
- auto str = llama_model_chat_template(model, /* name */ nullptr);
1686
- if (str) {
1687
- default_template_src = str;
1688
- has_explicit_template = true;
1689
- }
1690
- str = llama_model_chat_template(model, /* name */ "tool_use");
1691
- if (str) {
1692
- template_tool_use_src = str;
1693
- has_explicit_template = true;
1694
- }
1695
- } else {
1696
- default_template_src = chat_template_override;
1697
- }
1698
- if (default_template_src.empty() || default_template_src == "chatml") {
1699
- if (!template_tool_use_src.empty()) {
1700
- default_template_src = template_tool_use_src;
1701
- } else {
1702
- default_template_src = CHATML_TEMPLATE_SRC;
1703
- }
1704
- }
1705
- auto vocab = llama_model_get_vocab(model);
1706
- const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
1707
- if (token == LLAMA_TOKEN_NULL) {
1708
- if (default_template_src.find(jinja_variable_name) != std::string::npos
1709
- || template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
1710
- LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
1711
- }
1712
- return std::string();
1713
- } else {
1714
- return common_token_to_piece(vocab, token, true);
1715
- }
1716
- };
1717
- auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
1718
- auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
1719
- try {
1720
- return {
1721
- has_explicit_template,
1722
- std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
1723
- template_tool_use_src.empty()
1724
- ? nullptr
1725
- : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
1726
- };
1727
- } catch (const std::exception & e) {
1728
- LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
1729
- return {
1730
- has_explicit_template,
1731
- std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
1732
- nullptr,
1733
- };
1734
- }
1735
- }
1736
-
1737
- //
1738
- // KV cache utils
1739
- //
1740
-
1741
- void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
1742
- static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
1743
-
1744
- printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
1745
- view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
1746
-
1747
- llama_kv_cache_view_cell * c_curr = view.cells;
1748
- llama_seq_id * cs_curr = view.cells_sequences;
1749
-
1750
- for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1751
- if (i % row_size == 0) {
1752
- printf("\n%5d: ", i);
1753
- }
1754
- int seq_count = 0;
1755
- for (int j = 0; j < view.n_seq_max; j++) {
1756
- if (cs_curr[j] >= 0) { seq_count++; }
1757
- }
1758
- putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
1759
- }
1760
-
1761
- printf("\n=== Done dumping\n");
1762
- }
1763
-
1764
- void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
1765
- static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
1766
-
1767
- printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
1768
- view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
1769
-
1770
- std::unordered_map<llama_seq_id, size_t> seqs;
1771
- llama_kv_cache_view_cell * c_curr = view.cells;
1772
- llama_seq_id * cs_curr = view.cells_sequences;
1773
-
1774
- for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1775
- for (int j = 0; j < view.n_seq_max; j++) {
1776
- if (cs_curr[j] < 0) { continue; }
1777
- if (seqs.find(cs_curr[j]) == seqs.end()) {
1778
- if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
1779
- const size_t sz = seqs.size();
1780
- seqs[cs_curr[j]] = sz;
1781
- }
1782
- }
1783
- if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
1784
- }
1785
-
1786
- printf("=== Sequence legend: ");
1787
- for (const auto & it : seqs) {
1788
- printf("%zu=%d, ", it.second, it.first);
1789
- }
1790
- printf("'+'=other sequence ids");
1791
-
1792
- c_curr = view.cells;
1793
- cs_curr = view.cells_sequences;
1794
- for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1795
- if (i % row_size == 0) {
1796
- printf("\n%5d: ", i);
1797
- }
1798
- for (int j = 0; j < view.n_seq_max; j++) {
1799
- if (cs_curr[j] >= 0) {
1800
- const auto & it = seqs.find(cs_curr[j]);
1801
- putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
1802
- } else {
1803
- putchar('.');
1804
- }
1805
- }
1806
- putchar(' ');
1807
- }
1808
-
1809
- printf("\n=== Done dumping\n");
1810
- }
1811
-
1812
- //
1813
- // Embedding utils
1814
- //
1815
-
1816
// Scales the `n` values of `inp` into `out` according to `embd_norm`:
//   -1    : no normalisation (copy)
//    0    : divide by (max |x| / 32760), i.e. scale into an int16-like range
//    2    : divide by the euclidean norm
//   other : divide by the p-norm with p = embd_norm
// If the computed divisor is not positive, every output is 0.
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
    double sum = 0.0;

    if (embd_norm == -1) {
        // no normalisation
        sum = 1.0;
    } else if (embd_norm == 0) {
        // max absolute
        for (int k = 0; k < n; ++k) {
            if (sum < std::abs(inp[k])) {
                sum = std::abs(inp[k]);
            }
        }
        sum /= 32760.0;  // make an int16 range
    } else if (embd_norm == 2) {
        // euclidean
        for (int k = 0; k < n; ++k) {
            sum += inp[k] * inp[k];
        }
        sum = std::sqrt(sum);
    } else {
        // p-norm (euclidean is p-norm p=2)
        for (int k = 0; k < n; ++k) {
            sum += std::pow(std::abs(inp[k]), embd_norm);
        }
        sum = std::pow(sum, 1.0 / embd_norm);
    }

    const float norm = sum > 0.0 ? 1.0 / sum : 0.0f;

    for (int k = 0; k < n; ++k) {
        out[k] = inp[k] * norm;
    }
}
1851
-
1852
// Cosine similarity of two length-`n` vectors, accumulated in double.
// Two zero vectors are reported as 1.0; exactly one zero vector gives 0.0.
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
    double dot      = 0.0;
    double norm_sq1 = 0.0;
    double norm_sq2 = 0.0;

    for (int k = 0; k < n; ++k) {
        dot      += embd1[k] * embd2[k];
        norm_sq1 += embd1[k] * embd1[k];
        norm_sq2 += embd2[k] * embd2[k];
    }

    const bool zero1 = norm_sq1 == 0.0;
    const bool zero2 = norm_sq2 == 0.0;

    // Handle the case where one or both vectors are zero vectors
    if (zero1 && zero2) {
        return 1.0f;  // two zero vectors are similar
    }
    if (zero1 || zero2) {
        return 0.0f;
    }

    return dot / (std::sqrt(norm_sq1) * std::sqrt(norm_sq2));
}
1873
-
1874
- //
1875
- // Control vector utils
1876
- //
1877
-
1878
- static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
1879
- common_control_vector_data result = { -1, {} };
1880
-
1881
- lm_ggml_context * ctx = nullptr;
1882
- struct lm_gguf_init_params meta_lm_gguf_params = {
1883
- /* .no_alloc = */ false,
1884
- /* .ctx = */ &ctx,
1885
- };
1886
- struct lm_gguf_context * ctx_gguf = lm_gguf_init_from_file(load_info.fname.c_str(), meta_lm_gguf_params);
1887
- if (!ctx_gguf) {
1888
- LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
1889
- return result;
1890
- }
1891
-
1892
- int32_t n_tensors = lm_gguf_get_n_tensors(ctx_gguf);
1893
- if (n_tensors == 0) {
1894
- LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
1895
- }
1896
-
1897
- for (int i = 0; i < n_tensors; i++) {
1898
- std::string name = lm_gguf_get_tensor_name(ctx_gguf, i);
1899
-
1900
- int layer_idx = -1;
1901
-
1902
- // split on '.'
1903
- size_t dotpos = name.find('.');
1904
- if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
1905
- try {
1906
- layer_idx = std::stoi(name.substr(dotpos + 1));
1907
- } catch (...) {
1908
- layer_idx = -1;
1909
- }
1910
- }
1911
- if (layer_idx < 0) {
1912
- LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
1913
- result.n_embd = -1;
1914
- break;
1915
- } else if (layer_idx == 0) {
1916
- LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
1917
- result.n_embd = -1;
1918
- break;
1919
- }
1920
-
1921
- struct lm_ggml_tensor * tensor = lm_ggml_get_tensor(ctx, name.c_str());
1922
- if (tensor->type != LM_GGML_TYPE_F32) {
1923
- LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
1924
- result.n_embd = -1;
1925
- break;
1926
- }
1927
- if (lm_ggml_n_dims(tensor) != 1) {
1928
- LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
1929
- result.n_embd = -1;
1930
- break;
1931
- }
1932
-
1933
- if (result.n_embd == -1) {
1934
- result.n_embd = lm_ggml_nelements(tensor);
1935
- } else if (lm_ggml_nelements(tensor) != result.n_embd) {
1936
- LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
1937
- result.n_embd = -1;
1938
- break;
1939
- }
1940
-
1941
- // extend if necessary - do not store data for layer 0 (it's not used)
1942
- result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
1943
-
1944
- const float * src = (const float *) tensor->data;
1945
- float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
1946
- for (int j = 0; j < result.n_embd; j++) {
1947
- dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
1948
- }
1949
-
1950
- }
1951
-
1952
- if (result.n_embd == -1) {
1953
- LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
1954
- result.data.clear();
1955
- }
1956
-
1957
- lm_gguf_free(ctx_gguf);
1958
- lm_ggml_free(ctx);
1959
-
1960
- return result;
1961
- }
1962
-
1963
- common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
1964
- common_control_vector_data result = { -1, {} };
1965
-
1966
- for (const auto & info : load_infos) {
1967
- auto cur = common_control_vector_load_one(info);
1968
-
1969
- if (cur.n_embd == -1) {
1970
- result.n_embd = -1;
1971
- break;
1972
- }
1973
- if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
1974
- LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
1975
- result.n_embd = -1;
1976
- break;
1977
- }
1978
-
1979
- if (result.n_embd == -1) {
1980
- result = std::move(cur);
1981
- } else {
1982
- result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
1983
- for (size_t i = 0; i < cur.data.size(); i++) {
1984
- result.data[i] += cur.data[i];
1985
- }
1986
- }
1987
- }
1988
-
1989
- if (result.n_embd == -1) {
1990
- LOG_ERR("%s: no valid control vector files passed\n", __func__);
1991
- result.data.clear();
1992
- }
1993
-
1994
- return result;
1995
- }
1996
-
1
+ #if defined(_MSC_VER)
2
+ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
3
+ #endif
4
+
5
+ #include "ggml.h"
6
+ #include "gguf.h"
7
+
8
+ #include "common.h"
9
+ #include "log.h"
10
+ // Change JSON_ASSERT from assert() to LM_GGML_ASSERT:
11
+ #define JSON_ASSERT LM_GGML_ASSERT
12
+ #include "json.hpp"
13
+ #include "llama.h"
14
+
15
+ #include <algorithm>
16
+ #include <cinttypes>
17
+ #include <climits>
18
+ #include <cmath>
19
+ #include <codecvt>
20
+ #include <cstdarg>
21
+ #include <cstring>
22
+ #include <ctime>
23
+ #include <filesystem>
24
+ #include <fstream>
25
+ #include <iostream>
26
+ #include <iterator>
27
+ #include <regex>
28
+ #include <sstream>
29
+ #include <string>
30
+ #include <thread>
31
+ #include <unordered_map>
32
+ #include <unordered_set>
33
+ #include <vector>
34
+
35
+ #if defined(__APPLE__) && defined(__MACH__)
36
+ #include <sys/types.h>
37
+ #include <sys/sysctl.h>
38
+ #endif
39
+
40
+ #if defined(_WIN32)
41
+ #define WIN32_LEAN_AND_MEAN
42
+ #ifndef NOMINMAX
43
+ # define NOMINMAX
44
+ #endif
45
+ #include <locale>
46
+ #include <windows.h>
47
+ #include <fcntl.h>
48
+ #include <io.h>
49
+ #else
50
+ #include <sys/ioctl.h>
51
+ #include <sys/stat.h>
52
+ #include <unistd.h>
53
+ #endif
54
+ #if defined(LLAMA_USE_CURL)
55
+ #include <curl/curl.h>
56
+ #include <curl/easy.h>
57
+ #include <future>
58
+ #endif
59
+
60
+ // build info
61
+ int LLAMA_BUILD_NUMBER = 0;
62
+ char const *LLAMA_COMMIT = "unknown";
63
+ char const *LLAMA_COMPILER = "unknown";
64
+ char const *LLAMA_BUILD_TARGET = "unknown";
65
+
66
+ #if defined(_MSC_VER)
67
+ #pragma warning(disable: 4244 4267) // possible loss of data
68
+ #endif
69
+
70
+ #if defined(LLAMA_USE_CURL)
71
+ #ifdef __linux__
72
+ #include <linux/limits.h>
73
+ #elif defined(_WIN32)
74
+ # if !defined(PATH_MAX)
75
+ # define PATH_MAX MAX_PATH
76
+ # endif
77
+ #else
78
+ #include <sys/syslimits.h>
79
+ #endif
80
+ #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
81
+
82
+ //
83
+ // CURL utils
84
+ //
85
+
86
+ using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
87
+
88
+ // cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
89
+ struct curl_slist_ptr {
90
+ struct curl_slist * ptr = nullptr;
91
+ ~curl_slist_ptr() {
92
+ if (ptr) {
93
+ curl_slist_free_all(ptr);
94
+ }
95
+ }
96
+ };
97
+ #endif // LLAMA_USE_CURL
98
+
99
+ using json = nlohmann::ordered_json;
100
+
101
+ //
102
+ // CPU utils
103
+ //
104
+
105
// Returns an estimate of the number of physical CPU cores.
//  - Linux  : number of distinct thread_siblings masks found under /sys.
//  - macOS  : sysctl hw.perflevel0.physicalcpu, then hw.physicalcpu.
//  - Windows: processor-core records from GetLogicalProcessorInformationEx.
// When the platform query yields nothing, falls back to a heuristic based on
// std::thread::hardware_concurrency(): the value itself up to 4, half of it
// above 4, and 4 when it is unknown.
int32_t cpu_get_num_physical_cores() {
#ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
        const std::string path = "/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings";
        std::ifstream thread_siblings(path);
        if (!thread_siblings.is_open()) {
            break; // no more cpus
        }
        std::string mask_line;
        if (std::getline(thread_siblings, mask_line)) {
            siblings.insert(mask_line);
        }
    }
    if (!siblings.empty()) {
        return static_cast<int32_t>(siblings.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    int32_t num_physical_cores;
    size_t len = sizeof(num_physical_cores);
    if (sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0) == 0) {
        return num_physical_cores;
    }
    if (sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0) == 0) {
        return num_physical_cores;
    }
#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
    // TODO: windows + arm64 + mingw64
    unsigned int n_threads_win = std::thread::hardware_concurrency();
    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;

    // first call only asks for the required buffer size
    DWORD buffer_size = 0;
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
            return default_threads;
        }
    }

    std::vector<char> buffer(buffer_size);
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
        return default_threads;
    }

    // walk the variable-sized records
    int32_t num_physical_cores = 0;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
    while (buffer_size > 0) {
        if (info->Relationship == RelationProcessorCore) {
            num_physical_cores += info->Processor.GroupCount;
        }
        buffer_size -= info->Size;
        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
    }

    return num_physical_cores > 0 ? num_physical_cores : default_threads;
#endif
    const unsigned int n_threads = std::thread::hardware_concurrency();
    if (n_threads == 0) {
        return 4;
    }
    return n_threads <= 4 ? n_threads : n_threads / 2;
}
166
+
167
+ #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
168
+ #include <pthread.h>
169
+
170
// Executes the x86 CPUID instruction for the given leaf/subleaf and stores the
// four result registers. RBX is saved to RSI around the instruction and the
// EBX result is returned through RSI.
static void cpuid(unsigned leaf, unsigned subleaf,
                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
    __asm__(
        "movq\t%%rbx,%%rsi\n\t"
        "cpuid\n\t"
        "xchgq\t%%rbx,%%rsi"
        : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
        : "0"(leaf), "2"(subleaf)
    );
}
178
+
179
// Restricts the calling thread to the single CPU `cpu`.
// Returns the result of pthread_setaffinity_np() (0 on success).
static int pin_cpu(int cpu) {
    cpu_set_t only_this_cpu;
    CPU_ZERO(&only_this_cpu);
    CPU_SET(cpu, &only_this_cpu);
    const int rc = pthread_setaffinity_np(pthread_self(), sizeof(only_this_cpu), &only_this_cpu);
    return rc;
}
185
+
186
+ static bool is_hybrid_cpu(void) {
187
+ unsigned eax, ebx, ecx, edx;
188
+ cpuid(7, 0, &eax, &ebx, &ecx, &edx);
189
+ return !!(edx & (1u << 15));
190
+ }
191
+
192
+ static bool is_running_on_efficiency_core(void) {
193
+ unsigned eax, ebx, ecx, edx;
194
+ cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
195
+ int intel_atom = 0x20;
196
+ int core_type = (eax & 0xff000000u) >> 24;
197
+ return core_type == intel_atom;
198
+ }
199
+
200
+ static int cpu_count_math_cpus(int n_cpu) {
201
+ int result = 0;
202
+ for (int cpu = 0; cpu < n_cpu; ++cpu) {
203
+ if (pin_cpu(cpu)) {
204
+ return -1;
205
+ }
206
+ if (is_running_on_efficiency_core()) {
207
+ continue; // efficiency cores harm lockstep threading
208
+ }
209
+ ++cpu; // hyperthreading isn't useful for linear algebra
210
+ ++result;
211
+ }
212
+ return result;
213
+ }
214
+
215
+ #endif // __x86_64__ && __linux__
216
+
217
+ /**
218
+ * Returns number of CPUs on system that are useful for math.
219
+ */
220
+ int32_t cpu_get_num_math() {
221
+ #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
222
+ int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
223
+ if (n_cpu < 1) {
224
+ return cpu_get_num_physical_cores();
225
+ }
226
+ if (is_hybrid_cpu()) {
227
+ cpu_set_t affinity;
228
+ if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
229
+ int result = cpu_count_math_cpus(n_cpu);
230
+ pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
231
+ if (result > 0) {
232
+ return result;
233
+ }
234
+ }
235
+ }
236
+ #endif
237
+ return cpu_get_num_physical_cores();
238
+ }
239
+
240
+ // Helper for setting process priority
241
+
242
+ #if defined(_WIN32)
243
+
244
+ bool set_process_priority(enum lm_ggml_sched_priority prio) {
245
+ if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
246
+ return true;
247
+ }
248
+
249
+ DWORD p = NORMAL_PRIORITY_CLASS;
250
+ switch (prio) {
251
+ case LM_GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
252
+ case LM_GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
253
+ case LM_GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
254
+ case LM_GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
255
+ }
256
+
257
+ if (!SetPriorityClass(GetCurrentProcess(), p)) {
258
+ LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
259
+ return false;
260
+ }
261
+
262
+ return true;
263
+ }
264
+
265
+ #else // MacOS and POSIX
266
+ #include <sys/types.h>
267
+ #include <sys/resource.h>
268
+
269
+ bool set_process_priority(enum lm_ggml_sched_priority prio) {
270
+ if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
271
+ return true;
272
+ }
273
+
274
+ int p = 0;
275
+ switch (prio) {
276
+ case LM_GGML_SCHED_PRIO_NORMAL: p = 0; break;
277
+ case LM_GGML_SCHED_PRIO_MEDIUM: p = -5; break;
278
+ case LM_GGML_SCHED_PRIO_HIGH: p = -10; break;
279
+ case LM_GGML_SCHED_PRIO_REALTIME: p = -20; break;
280
+ }
281
+
282
+ if (!setpriority(PRIO_PROCESS, 0, p)) {
283
+ LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
284
+ return false;
285
+ }
286
+ return true;
287
+ }
288
+
289
+ #endif
290
+
291
+ //
292
+ // CLI argument parsing
293
+ //
294
+
295
+
296
+ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
297
+ int32_t n_set = 0;
298
+
299
+ if (cpuparams.n_threads < 0) {
300
+ // Assuming everything about cpuparams is invalid
301
+ if (role_model != nullptr) {
302
+ cpuparams = *role_model;
303
+ } else {
304
+ cpuparams.n_threads = cpu_get_num_math();
305
+ }
306
+ }
307
+
308
+ for (int32_t i = 0; i < LM_GGML_MAX_N_THREADS; i++) {
309
+ if (cpuparams.cpumask[i]) {
310
+ n_set++;
311
+ }
312
+ }
313
+
314
+ if (n_set && n_set < cpuparams.n_threads) {
315
+ // Not enough set bits, may experience performance issues.
316
+ LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
317
+ }
318
+ }
319
+
320
+ bool parse_cpu_range(const std::string & range, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
321
+ size_t dash_loc = range.find('-');
322
+ if (dash_loc == std::string::npos) {
323
+ LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
324
+ return false;
325
+ }
326
+
327
+ size_t start_i;
328
+ size_t end_i;
329
+
330
+ if (dash_loc == 0) {
331
+ start_i = 0;
332
+ } else {
333
+ start_i = std::stoull(range.substr(0, dash_loc));
334
+ if (start_i >= LM_GGML_MAX_N_THREADS) {
335
+ LOG_ERR("Start index out of bounds!\n");
336
+ return false;
337
+ }
338
+ }
339
+
340
+ if (dash_loc == range.length() - 1) {
341
+ end_i = LM_GGML_MAX_N_THREADS - 1;
342
+ } else {
343
+ end_i = std::stoull(range.substr(dash_loc + 1));
344
+ if (end_i >= LM_GGML_MAX_N_THREADS) {
345
+ LOG_ERR("End index out of bounds!\n");
346
+ return false;
347
+ }
348
+ }
349
+
350
+ for (size_t i = start_i; i <= end_i; i++) {
351
+ boolmask[i] = true;
352
+ }
353
+
354
+ return true;
355
+ }
356
+
357
+ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THREADS]) {
358
+ // Discard potential 0x prefix
359
+ size_t start_i = 0;
360
+ if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
361
+ start_i = 2;
362
+ }
363
+
364
+ size_t num_digits = mask.length() - start_i;
365
+ if (num_digits > 128) num_digits = 128;
366
+
367
+ size_t end_i = num_digits + start_i;
368
+
369
+ for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
370
+ char c = mask.at(i);
371
+ int8_t id = c;
372
+
373
+ if ((c >= '0' && c <= '9')) {
374
+ id -= '0';
375
+ } else if (c >= 'a' && c <= 'f') {
376
+ id -= 'a' - 10;
377
+ } else if (c >= 'A' && c <= 'F') {
378
+ id -= 'A' - 10;
379
+ } else {
380
+ LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
381
+ return false;
382
+ }
383
+
384
+ boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
385
+ boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
386
+ boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
387
+ boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
388
+ }
389
+
390
+ return true;
391
+ }
392
+
393
+ void common_init() {
394
+ llama_log_set([](lm_ggml_log_level level, const char * text, void * /*user_data*/) {
395
+ if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
396
+ common_log_add(common_log_main(), level, "%s", text);
397
+ }
398
+ }, NULL);
399
+
400
+ #ifdef NDEBUG
401
+ const char * build_type = "";
402
+ #else
403
+ const char * build_type = " (debug)";
404
+ #endif
405
+
406
+ LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
407
+ }
408
+
409
+ std::string common_params_get_system_info(const common_params & params) {
410
+ std::ostringstream os;
411
+
412
+ os << "system_info: n_threads = " << params.cpuparams.n_threads;
413
+ if (params.cpuparams_batch.n_threads != -1) {
414
+ os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
415
+ }
416
+ #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
417
+ // TODO: windows + arm64 + mingw64
418
+ DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
419
+ os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
420
+ #else
421
+ os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
422
+ #endif
423
+
424
+ return os.str();
425
+ }
426
+
427
+ //
428
+ // String utils
429
+ //
430
+
431
+ std::string string_format(const char * fmt, ...) {
432
+ va_list ap;
433
+ va_list ap2;
434
+ va_start(ap, fmt);
435
+ va_copy(ap2, ap);
436
+ int size = vsnprintf(NULL, 0, fmt, ap);
437
+ LM_GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
438
+ std::vector<char> buf(size + 1);
439
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
440
+ LM_GGML_ASSERT(size2 == size);
441
+ va_end(ap2);
442
+ va_end(ap);
443
+ return std::string(buf.data(), size);
444
+ }
445
+
446
// Returns a copy of `str` without leading and trailing whitespace, as classified by std::isspace.
// A string made only of whitespace yields an empty string.
std::string string_strip(const std::string & str) {
    // std::isspace is undefined for negative arguments, and plain `char` can be negative
    // for non-ASCII bytes (e.g. UTF-8), so classify through unsigned char
    const auto is_space = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    size_t start = 0;
    size_t end = str.size();
    while (start < end && is_space(str[start])) {
        start++;
    }
    while (end > start && is_space(str[end - 1])) {
        end--;
    }
    return str.substr(start, end - start);
}
457
+
458
// Returns the current local time as "YYYY_MM_DD-hh_mm_ss.nnnnnnnnn", where the part after
// the dot is the sub-second fraction in nanoseconds, zero-padded to 9 digits.
std::string string_get_sortable_timestamp() {
    using clock = std::chrono::system_clock;

    const clock::time_point current_time = clock::now();
    const time_t as_time_t = clock::to_time_t(current_time);
    char timestamp_no_ns[100];
    std::strftime(timestamp_no_ns, sizeof(timestamp_no_ns), "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));

    // Convert to nanoseconds BEFORE taking the remainder: the tick period of system_clock is
    // implementation-defined, so a modulo on the raw tick count is only a sub-second value
    // when the clock happens to tick in nanoseconds
    const auto since_epoch_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
        current_time.time_since_epoch());
    const int64_t ns = (since_epoch_ns % std::chrono::seconds(1)).count();
    char timestamp_ns[11];
    snprintf(timestamp_ns, sizeof(timestamp_ns), "%09" PRId64, ns);

    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
473
+
474
// Replaces every non-overlapping occurrence of `search` in `s` with `replace`, scanning left
// to right. Text inserted by a replacement is not scanned again. An empty `search` leaves
// `s` unchanged.
void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }

    std::string out;
    out.reserve(s.length());

    // `cursor` is the first character of `s` not yet copied to `out`
    size_t cursor = 0;
    for (size_t hit = s.find(search, cursor); hit != std::string::npos; hit = s.find(search, cursor)) {
        out.append(s, cursor, hit - cursor);
        out.append(replace);
        cursor = hit + search.length();
    }

    // tail after the last occurrence
    out.append(s, cursor, std::string::npos);
    s = std::move(out);
}
490
+
491
// Returns `s` with a backslash inserted in front of each of these characters:
// . ^ $ | ( ) * + ? [ ] { } and backslash itself.
std::string regex_escape(const std::string & s) {
    // character class listing the characters to escape
    static const std::regex metachars("[.^$|()*+?\\[\\]{}\\\\]");
    // replacement: a literal backslash followed by the matched text
    const char * const backslash_then_match = "\\$0";
    return std::regex_replace(s, metachars, backslash_then_match);
}
495
+
496
// Concatenates `values`, placing `separator` between consecutive elements.
// An empty vector yields an empty string.
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
    std::ostringstream joined;
    bool need_separator = false;
    for (const auto & item : values) {
        if (need_separator) {
            joined << separator;
        }
        joined << item;
        need_separator = true;
    }
    return joined.str();
}
506
+
507
// Splits `str` at every occurrence of `delimiter` and returns the pieces in order.
// Empty pieces are kept, so the result always holds at least one element.
// An empty `delimiter` cannot split anything: the whole string is returned as one piece.
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
    std::vector<std::string> parts;

    // find("") matches at every position without advancing, which would loop forever
    if (delimiter.empty()) {
        parts.push_back(str);
        return parts;
    }

    size_t start = 0;
    size_t end = str.find(delimiter);

    while (end != std::string::npos) {
        parts.push_back(str.substr(start, end - start));
        start = end + delimiter.length();
        end = str.find(delimiter, start);
    }

    // remainder after the last delimiter (possibly empty)
    parts.push_back(str.substr(start));

    return parts;
}
522
+
523
// Returns `str` concatenated with itself `n` times; n == 0 yields an empty string.
std::string string_repeat(const std::string & str, size_t n) {
    std::string out;

    if (n != 0) {
        // the final size is known up front
        out.reserve(str.length() * n);
        for (size_t left = n; left > 0; --left) {
            out += str;
        }
    }

    return out;
}
537
+
538
// Textual form of a boolean: "true" or "false".
std::string string_from(bool value) {
    if (value) {
        return "true";
    }
    return "false";
}
541
+
542
// Formats a list of integers as "[ a, b, c ]"; an empty list gives "[  ]".
std::string string_from(const std::vector<int> & values) {
    std::stringstream out;

    out << "[ ";
    for (size_t i = 0; i < values.size(); ++i) {
        // separator goes in front of every element except the first
        if (i != 0) {
            out << ", ";
        }
        out << std::to_string(values[i]);
    }
    out << " ]";

    return out.str();
}
559
+
560
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
561
+ std::stringstream buf;
562
+
563
+ buf << "[ ";
564
+
565
+ bool first = true;
566
+ for (const auto & token : tokens) {
567
+ if (!first) {
568
+ buf << ", ";
569
+ } else {
570
+ first = false;
571
+ }
572
+
573
+ auto detokenized = common_token_to_piece(ctx, token);
574
+
575
+ detokenized.erase(
576
+ std::remove_if(
577
+ detokenized.begin(),
578
+ detokenized.end(),
579
+ [](const unsigned char c) { return !std::isprint(c); }),
580
+ detokenized.end());
581
+
582
+ buf << "'" << detokenized << "'"
583
+ << ":" << std::to_string(token);
584
+ }
585
+
586
+ buf << " ]";
587
+
588
+ return buf.str();
589
+ }
590
+
591
+ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
592
+ std::stringstream buf;
593
+
594
+ buf << "[ ";
595
+
596
+ bool first = true;
597
+ for (int i = 0; i < batch.n_tokens; ++i) {
598
+ if (!first) {
599
+ buf << ", ";
600
+ } else {
601
+ first = false;
602
+ }
603
+
604
+ auto detokenized = common_token_to_piece(ctx, batch.token[i]);
605
+
606
+ detokenized.erase(
607
+ std::remove_if(
608
+ detokenized.begin(),
609
+ detokenized.end(),
610
+ [](const unsigned char c) { return !std::isprint(c); }),
611
+ detokenized.end());
612
+
613
+ buf << "\n" << std::to_string(i)
614
+ << ", token '" << detokenized << "'"
615
+ << ", pos " << std::to_string(batch.pos[i])
616
+ << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
617
+ << ", seq_id " << std::to_string(batch.seq_id[i][0])
618
+ << ", logits " << std::to_string(batch.logits[i]);
619
+ }
620
+
621
+ buf << " ]";
622
+
623
+ return buf.str();
624
+ }
625
+
626
// Decodes backslash escapes in `input` in place: \n \r \t \' \" \\ and \xHH (two hex digits).
// Any other escape, a malformed \x, and a trailing lone backslash are kept as written.
// The string only ever shrinks, so the write position never overtakes the read position.
void string_process_escapes(std::string & input) {
    const std::size_t len = input.length();
    std::size_t out = 0;
    std::size_t in = 0;

    while (in < len) {
        const char cur = input[in];

        // ordinary character, or a backslash with nothing after it: copy through
        if (cur != '\\' || in + 1 >= len) {
            input[out++] = cur;
            ++in;
            continue;
        }

        // step onto the character that follows the backslash
        ++in;

        char decoded = 0;
        bool recognized = true;
        switch (input[in]) {
            case 'n':  decoded = '\n'; break;
            case 'r':  decoded = '\r'; break;
            case 't':  decoded = '\t'; break;
            case '\'': decoded = '\''; break;
            case '\"': decoded = '\"'; break;
            case '\\': decoded = '\\'; break;
            case 'x': {
                // Handle \x12, etc
                recognized = false;
                if (in + 2 < len) {
                    const char hex[3] = { input[in + 1], input[in + 2], 0 };
                    char * stop = nullptr;
                    const long value = std::strtol(hex, &stop, 16);
                    // accepted only when strtol consumed both characters
                    if (stop == hex + 2) {
                        in += 2;
                        decoded = char(value);
                        recognized = true;
                    }
                }
                break;
            }
            default:
                recognized = false;
                break;
        }

        if (recognized) {
            input[out++] = decoded;
        } else {
            // unknown escape: emit the backslash and the character unchanged
            input[out++] = '\\';
            input[out++] = input[in];
        }
        ++in;
    }

    input.resize(out);
}
662
+
663
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
664
+ const char * sep = strchr(data, '=');
665
+ if (sep == nullptr || sep - data >= 128) {
666
+ LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
667
+ return false;
668
+ }
669
+ llama_model_kv_override kvo;
670
+ std::strncpy(kvo.key, data, sep - data);
671
+ kvo.key[sep - data] = 0;
672
+ sep++;
673
+ if (strncmp(sep, "int:", 4) == 0) {
674
+ sep += 4;
675
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
676
+ kvo.val_i64 = std::atol(sep);
677
+ } else if (strncmp(sep, "float:", 6) == 0) {
678
+ sep += 6;
679
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
680
+ kvo.val_f64 = std::atof(sep);
681
+ } else if (strncmp(sep, "bool:", 5) == 0) {
682
+ sep += 5;
683
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
684
+ if (std::strcmp(sep, "true") == 0) {
685
+ kvo.val_bool = true;
686
+ } else if (std::strcmp(sep, "false") == 0) {
687
+ kvo.val_bool = false;
688
+ } else {
689
+ LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
690
+ return false;
691
+ }
692
+ } else if (strncmp(sep, "str:", 4) == 0) {
693
+ sep += 4;
694
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
695
+ if (strlen(sep) > 127) {
696
+ LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
697
+ return false;
698
+ }
699
+ strncpy(kvo.val_str, sep, 127);
700
+ kvo.val_str[127] = '\0';
701
+ } else {
702
+ LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
703
+ return false;
704
+ }
705
+ overrides.emplace_back(std::move(kvo));
706
+ return true;
707
+ }
708
+
709
+ //
710
+ // Filesystem utils
711
+ //
712
+
713
// Checks whether a single filename (not a path) is safe to use.
// For a full path, split it by the OS-specific separator and check each component.
// Rejected: empty names, names longer than 255 bytes, invalid or non-canonical UTF-8,
// forbidden codepoints, leading/trailing space, trailing '.', any "..", and ".".
bool fs_validate_filename(const std::string & filename) {
    // Empty filename invalid
    if (filename.empty()) {
        return false;
    }
    // Limit at common largest possible filename on Linux filesystems
    // to avoid unnecessary further validation
    // (On systems with smaller limits it will be caught by the OS)
    if (filename.length() > 255) {
        return false;
    }

    std::u32string codepoints;
    try {
#if defined(__clang__)
        // disable C++17 deprecation warning for std::codecvt_utf8
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;

#if defined(__clang__)
#    pragma clang diagnostic pop
#endif

        codepoints = converter.from_bytes(filename);

        // A round trip that does not reproduce the input means overlong UTF-8 sequences
        // or invalid encodings were present. Reject such attempts
        const std::string round_trip = converter.to_bytes(codepoints);
        if (round_trip != filename) {
            return false;
        }
    } catch (const std::exception &) {
        return false;
    }

    // Forbidden codepoints:
    // - Control characters
    // - Unicode equivalents of illegal characters
    // - UTF-16 surrogate pairs
    // - UTF-8 replacement character
    // - Byte order mark (BOM)
    // - Illegal characters: / \ : * ? " < > |
    const auto is_forbidden = [](char32_t c) {
        if (c <= 0x1F) return true;                 // Control characters (C0)
        if (c == 0x7F) return true;                 // Control characters (DEL)
        if (c >= 0x80 && c <= 0x9F) return true;    // Control characters (C1)
        if (c == 0xFF0E) return true;               // Fullwidth Full Stop (period equivalent)
        if (c == 0x2215) return true;               // Division Slash (forward slash equivalent)
        if (c == 0x2216) return true;               // Set Minus (backslash equivalent)
        if (c >= 0xD800 && c <= 0xDFFF) return true; // UTF-16 surrogate pairs
        if (c == 0xFFFD) return true;               // Replacement Character (UTF-8)
        if (c == 0xFEFF) return true;               // Byte Order Mark (BOM)
        switch (c) {                                // Illegal characters
            case '/': case '\\': case ':': case '*':
            case '?': case '"':  case '<': case '>': case '|':
                return true;
            default:
                return false;
        }
    };
    for (const char32_t c : codepoints) {
        if (is_forbidden(c)) {
            return false;
        }
    }

    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
    // Unicode and other whitespace is not affected, only 0x20 space
    const char first = filename.front();
    const char last  = filename.back();
    if (first == ' ' || last == ' ' || last == '.') {
        return false;
    }

    // Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
    if (filename.find("..") != std::string::npos) {
        return false;
    }

    // Reject "."
    return filename != ".";
}
793
+
794
// Creates every directory named by the prefixes of `path` that end just before a separator
// ('\\' on Windows, '/' elsewhere). Text after the last separator is NOT created, so pass a
// path with a trailing separator to create the full directory.
// If `path` already exists, the POSIX branch returns whether it is a directory; the Windows
// branch returns true for a directory and otherwise continues with creation.
// returns true if successful, false otherwise
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    std::wstring wpath = converter.from_bytes(path);

    // if the path already exists, check whether it's a directory
    const DWORD attributes = GetFileAttributesW(wpath.c_str());
    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
        return true;
    }

    size_t pos_slash = 0;

    // process path from front to back, procedurally creating directories
    // The separator is searched in the wide string itself: offsets found in the UTF-8 `path`
    // are byte offsets and do not match wide-character offsets once the path is non-ASCII
    while ((pos_slash = wpath.find(L'\\', pos_slash)) != std::wstring::npos) {
        const std::wstring subpath = wpath.substr(0, pos_slash);

        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
        if (!success) {
            const DWORD error = GetLastError();

            // if the path already exists, ensure that it's a directory
            if (error == ERROR_ALREADY_EXISTS) {
                const DWORD sub_attributes = GetFileAttributesW(subpath.c_str());
                if (sub_attributes == INVALID_FILE_ATTRIBUTES || !(sub_attributes & FILE_ATTRIBUTE_DIRECTORY)) {
                    return false;
                }
            } else {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#else
    // if the path already exists, check whether it's a directory
    struct stat info;
    if (stat(path.c_str(), &info) == 0) {
        return S_ISDIR(info.st_mode);
    }

    size_t pos_slash = 1; // skip leading slashes for directory creation

    // process path from front to back, procedurally creating directories
    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
        const std::string subpath = path.substr(0, pos_slash);
        struct stat sub_info;

        // if the path already exists, ensure that it's a directory
        if (stat(subpath.c_str(), &sub_info) == 0) {
            if (!S_ISDIR(sub_info.st_mode)) {
                return false;
            }
        } else {
            // create parent directories
            const int ret = mkdir(subpath.c_str(), 0755);
            if (ret != 0) {
                return false;
            }
        }

        pos_slash += 1;
    }

    return true;
#endif // _WIN32
}
865
+
866
+ std::string fs_get_cache_directory() {
867
+ std::string cache_directory = "";
868
+ auto ensure_trailing_slash = [](std::string p) {
869
+ // Make sure to add trailing slash
870
+ if (p.back() != DIRECTORY_SEPARATOR) {
871
+ p += DIRECTORY_SEPARATOR;
872
+ }
873
+ return p;
874
+ };
875
+ if (getenv("LLAMA_CACHE")) {
876
+ cache_directory = std::getenv("LLAMA_CACHE");
877
+ } else {
878
+ #ifdef __linux__
879
+ if (std::getenv("XDG_CACHE_HOME")) {
880
+ cache_directory = std::getenv("XDG_CACHE_HOME");
881
+ } else {
882
+ cache_directory = std::getenv("HOME") + std::string("/.cache/");
883
+ }
884
+ #elif defined(__APPLE__)
885
+ cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
886
+ #elif defined(_WIN32)
887
+ cache_directory = std::getenv("LOCALAPPDATA");
888
+ #endif // __linux__
889
+ cache_directory = ensure_trailing_slash(cache_directory);
890
+ cache_directory += "llama.cpp";
891
+ }
892
+ return ensure_trailing_slash(cache_directory);
893
+ }
894
+
895
+ std::string fs_get_cache_file(const std::string & filename) {
896
+ LM_GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
897
+ std::string cache_directory = fs_get_cache_directory();
898
+ const bool success = fs_create_directory_with_parents(cache_directory);
899
+ if (!success) {
900
+ throw std::runtime_error("failed to create cache directory: " + cache_directory);
901
+ }
902
+ return cache_directory + filename;
903
+ }
904
+
905
+
906
+ //
907
+ // Model utils
908
+ //
909
+ struct common_init_result common_init_from_params(common_params & params) {
910
+ common_init_result iparams;
911
+ auto mparams = common_model_params_to_llama(params);
912
+
913
+ llama_model * model = nullptr;
914
+
915
+ if (!params.hf_repo.empty() && !params.hf_file.empty()) {
916
+ model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
917
+ } else if (!params.model_url.empty()) {
918
+ model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
919
+ } else {
920
+ model = llama_model_load_from_file(params.model.c_str(), mparams);
921
+ }
922
+
923
+ if (model == NULL) {
924
+ LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
925
+ return iparams;
926
+ }
927
+
928
+ const llama_vocab * vocab = llama_model_get_vocab(model);
929
+
930
+ if (params.reranking) {
931
+ bool ok = true;
932
+
933
+ if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
934
+ LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
935
+ ok = false;
936
+ }
937
+
938
+ if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
939
+ LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
940
+ ok = false;
941
+ }
942
+
943
+ if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
944
+ LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
945
+ ok = false;
946
+ }
947
+
948
+ if (!ok) {
949
+ llama_model_free(model);
950
+
951
+ return iparams;
952
+ }
953
+ }
954
+
955
+ auto cparams = common_context_params_to_llama(params);
956
+
957
+ llama_context * lctx = llama_init_from_model(model, cparams);
958
+ if (lctx == NULL) {
959
+ LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
960
+ llama_model_free(model);
961
+ return iparams;
962
+ }
963
+
964
+ if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
965
+ LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
966
+ params.ctx_shift = false;
967
+ }
968
+
969
+ if (!params.control_vectors.empty()) {
970
+ if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
971
+ if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
972
+
973
+ const auto cvec = common_control_vector_load(params.control_vectors);
974
+ if (cvec.n_embd == -1) {
975
+ llama_free(lctx);
976
+ llama_model_free(model);
977
+
978
+ return iparams;
979
+ }
980
+
981
+ int err = llama_apply_adapter_cvec(
982
+ lctx,
983
+ cvec.data.data(),
984
+ cvec.data.size(),
985
+ cvec.n_embd,
986
+ params.control_vector_layer_start,
987
+ params.control_vector_layer_end);
988
+ if (err) {
989
+ llama_free(lctx);
990
+ llama_model_free(model);
991
+
992
+ return iparams;
993
+ }
994
+ }
995
+
996
+ // load and optionally apply lora adapters
997
+ for (auto & la : params.lora_adapters) {
998
+ llama_adapter_lora_ptr lora;
999
+ lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
1000
+ if (lora == nullptr) {
1001
+ LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
1002
+ llama_free(lctx);
1003
+ llama_model_free(model);
1004
+ return iparams;
1005
+ }
1006
+
1007
+ la.ptr = lora.get();
1008
+ iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
1009
+ }
1010
+
1011
+ if (!params.lora_init_without_apply) {
1012
+ common_set_adapter_lora(lctx, params.lora_adapters);
1013
+ }
1014
+
1015
+ if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
1016
+ LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
1017
+ params.sampling.ignore_eos = false;
1018
+ }
1019
+
1020
+ if (params.sampling.ignore_eos) {
1021
+ for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
1022
+ if (llama_vocab_is_eog(vocab, i)) {
1023
+ LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
1024
+ params.sampling.logit_bias.push_back({i, -INFINITY});
1025
+ }
1026
+ }
1027
+ }
1028
+
1029
+ if (params.sampling.penalty_last_n == -1) {
1030
+ LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
1031
+ params.sampling.penalty_last_n = llama_n_ctx(lctx);
1032
+ }
1033
+
1034
+ if (params.sampling.dry_penalty_last_n == -1) {
1035
+ LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
1036
+ params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
1037
+ }
1038
+
1039
+ if (params.warmup) {
1040
+ LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
1041
+
1042
+ std::vector<llama_token> tmp;
1043
+ llama_token bos = llama_vocab_bos(vocab);
1044
+ llama_token eos = llama_vocab_eos(vocab);
1045
+
1046
+ // some models (e.g. T5) don't have a BOS token
1047
+ if (bos != LLAMA_TOKEN_NULL) {
1048
+ tmp.push_back(bos);
1049
+ }
1050
+ if (eos != LLAMA_TOKEN_NULL) {
1051
+ tmp.push_back(eos);
1052
+ }
1053
+ if (tmp.empty()) {
1054
+ tmp.push_back(0);
1055
+ }
1056
+
1057
+ if (llama_model_has_encoder(model)) {
1058
+ llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
1059
+ llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
1060
+ if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
1061
+ decoder_start_token_id = bos;
1062
+ }
1063
+ tmp.clear();
1064
+ tmp.push_back(decoder_start_token_id);
1065
+ }
1066
+ if (llama_model_has_decoder(model)) {
1067
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
1068
+ }
1069
+ llama_kv_cache_clear(lctx);
1070
+ llama_synchronize(lctx);
1071
+ llama_perf_context_reset(lctx);
1072
+ }
1073
+
1074
+ iparams.model.reset(model);
1075
+ iparams.context.reset(lctx);
1076
+
1077
+ return iparams;
1078
+ }
1079
+
1080
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
1081
+ llama_clear_adapter_lora(ctx);
1082
+ for (auto & la : lora) {
1083
+ if (la.scale != 0.0f) {
1084
+ llama_set_adapter_lora(ctx, la.ptr, la.scale);
1085
+ }
1086
+ }
1087
+ }
1088
+
1089
+ struct llama_model_params common_model_params_to_llama(common_params & params) {
1090
+ auto mparams = llama_model_default_params();
1091
+
1092
+ if (!params.devices.empty()) {
1093
+ mparams.devices = params.devices.data();
1094
+ }
1095
+ if (params.n_gpu_layers != -1) {
1096
+ mparams.n_gpu_layers = params.n_gpu_layers;
1097
+ }
1098
+
1099
+ mparams.progress_callback_user_data = params.progress_callback_user_data;
1100
+ mparams.progress_callback = params.progress_callback;
1101
+ mparams.vocab_only = params.vocab_only;
1102
+ mparams.main_gpu = params.main_gpu;
1103
+ mparams.split_mode = params.split_mode;
1104
+ mparams.tensor_split = params.tensor_split;
1105
+ mparams.use_mmap = params.use_mmap;
1106
+ mparams.use_mlock = params.use_mlock;
1107
+ mparams.check_tensors = params.check_tensors;
1108
+ if (params.kv_overrides.empty()) {
1109
+ mparams.kv_overrides = NULL;
1110
+ } else {
1111
+ LM_GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
1112
+ mparams.kv_overrides = params.kv_overrides.data();
1113
+ }
1114
+
1115
+ return mparams;
1116
+ }
1117
+
1118
+ struct llama_context_params common_context_params_to_llama(const common_params & params) {
1119
+ auto cparams = llama_context_default_params();
1120
+
1121
+ cparams.n_ctx = params.n_ctx;
1122
+ cparams.n_seq_max = params.n_parallel;
1123
+ cparams.n_batch = params.n_batch;
1124
+ cparams.n_ubatch = params.n_ubatch;
1125
+ cparams.n_threads = params.cpuparams.n_threads;
1126
+ cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
1127
+ params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
1128
+ cparams.logits_all = params.logits_all;
1129
+ cparams.embeddings = params.embedding;
1130
+ cparams.rope_scaling_type = params.rope_scaling_type;
1131
+ cparams.rope_freq_base = params.rope_freq_base;
1132
+ cparams.rope_freq_scale = params.rope_freq_scale;
1133
+ cparams.yarn_ext_factor = params.yarn_ext_factor;
1134
+ cparams.yarn_attn_factor = params.yarn_attn_factor;
1135
+ cparams.yarn_beta_fast = params.yarn_beta_fast;
1136
+ cparams.yarn_beta_slow = params.yarn_beta_slow;
1137
+ cparams.yarn_orig_ctx = params.yarn_orig_ctx;
1138
+ cparams.pooling_type = params.pooling_type;
1139
+ cparams.attention_type = params.attention_type;
1140
+ cparams.defrag_thold = params.defrag_thold;
1141
+ cparams.cb_eval = params.cb_eval;
1142
+ cparams.cb_eval_user_data = params.cb_eval_user_data;
1143
+ cparams.offload_kqv = !params.no_kv_offload;
1144
+ cparams.flash_attn = params.flash_attn;
1145
+ cparams.no_perf = params.no_perf;
1146
+
1147
+ if (params.reranking) {
1148
+ cparams.embeddings = true;
1149
+ cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
1150
+ }
1151
+
1152
+ cparams.type_k = params.cache_type_k;
1153
+ cparams.type_v = params.cache_type_v;
1154
+
1155
+ return cparams;
1156
+ }
1157
+
1158
+ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
1159
+ struct lm_ggml_threadpool_params tpp;
1160
+
1161
+ lm_ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
1162
+
1163
+ if (params.mask_valid) {
1164
+ std::memcpy(&tpp.cpumask, &params.cpumask, LM_GGML_MAX_N_THREADS);
1165
+ }
1166
+
1167
+ tpp.prio = params.priority;
1168
+ tpp.poll = params.poll;
1169
+ tpp.strict_cpu = params.strict_cpu;
1170
+
1171
+ return tpp;
1172
+ }
1173
+
1174
+ #ifdef LLAMA_USE_CURL
1175
+
1176
+ #define CURL_MAX_RETRY 3
1177
+ #define CURL_RETRY_DELAY_SECONDS 2
1178
+
1179
+ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
1180
+ int remaining_attempts = max_attempts;
1181
+
1182
+ while (remaining_attempts > 0) {
1183
+ LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
1184
+
1185
+ CURLcode res = curl_easy_perform(curl);
1186
+ if (res == CURLE_OK) {
1187
+ return true;
1188
+ }
1189
+
1190
+ int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
1191
+ LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
1192
+
1193
+ remaining_attempts--;
1194
+ std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
1195
+ }
1196
+
1197
+ LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
1198
+
1199
+ return false;
1200
+ }
1201
+
1202
+ static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
1203
+ // Initialize libcurl
1204
+ curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
1205
+ curl_slist_ptr http_headers;
1206
+ if (!curl) {
1207
+ LOG_ERR("%s: error initializing libcurl\n", __func__);
1208
+ return false;
1209
+ }
1210
+
1211
+ bool force_download = false;
1212
+
1213
+ // Set the URL, allow to follow http redirection
1214
+ curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
1215
+ curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
1216
+
1217
+ // Check if hf-token or bearer-token was specified
1218
+ if (!hf_token.empty()) {
1219
+ std::string auth_header = "Authorization: Bearer " + hf_token;
1220
+ http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
1221
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
1222
+ }
1223
+
1224
+ #if defined(_WIN32)
1225
+ // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
1226
+ // operating system. Currently implemented under MS-Windows.
1227
+ curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
1228
+ #endif
1229
+
1230
+ // Check if the file already exists locally
1231
+ auto file_exists = std::filesystem::exists(path);
1232
+
1233
+ // If the file exists, check its JSON metadata companion file.
1234
+ std::string metadata_path = path + ".json";
1235
+ nlohmann::json metadata;
1236
+ std::string etag;
1237
+ std::string last_modified;
1238
+
1239
+ if (file_exists) {
1240
+ // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
1241
+ std::ifstream metadata_in(metadata_path);
1242
+ if (metadata_in.good()) {
1243
+ try {
1244
+ metadata_in >> metadata;
1245
+ LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
1246
+ if (metadata.contains("url") && metadata.at("url").is_string()) {
1247
+ auto previous_url = metadata.at("url").get<std::string>();
1248
+ if (previous_url != url) {
1249
+ LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
1250
+ return false;
1251
+ }
1252
+ }
1253
+ if (metadata.contains("etag") && metadata.at("etag").is_string()) {
1254
+ etag = metadata.at("etag");
1255
+ }
1256
+ if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
1257
+ last_modified = metadata.at("lastModified");
1258
+ }
1259
+ } catch (const nlohmann::json::exception & e) {
1260
+ LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
1261
+ return false;
1262
+ }
1263
+ }
1264
+ } else {
1265
+ LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
1266
+ }
1267
+
1268
+ // Send a HEAD request to retrieve the etag and last-modified headers
1269
+ struct common_load_model_from_url_headers {
1270
+ std::string etag;
1271
+ std::string last_modified;
1272
+ };
1273
+
1274
+ common_load_model_from_url_headers headers;
1275
+
1276
+ {
1277
+ typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
1278
+ auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
1279
+ common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
1280
+
1281
+ static std::regex header_regex("([^:]+): (.*)\r\n");
1282
+ static std::regex etag_regex("ETag", std::regex_constants::icase);
1283
+ static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
1284
+
1285
+ std::string header(buffer, n_items);
1286
+ std::smatch match;
1287
+ if (std::regex_match(header, match, header_regex)) {
1288
+ const std::string & key = match[1];
1289
+ const std::string & value = match[2];
1290
+ if (std::regex_match(key, match, etag_regex)) {
1291
+ headers->etag = value;
1292
+ } else if (std::regex_match(key, match, last_modified_regex)) {
1293
+ headers->last_modified = value;
1294
+ }
1295
+ }
1296
+ return n_items;
1297
+ };
1298
+
1299
+ curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
1300
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
1301
+ curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
1302
+ curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
1303
+
1304
+ bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
1305
+ if (!was_perform_successful) {
1306
+ return false;
1307
+ }
1308
+
1309
+ long http_code = 0;
1310
+ curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
1311
+ if (http_code != 200) {
1312
+ // HEAD not supported, we don't know if the file has changed
1313
+ // force trigger downloading
1314
+ force_download = true;
1315
+ LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
1316
+ }
1317
+ }
1318
+
1319
+ bool should_download = !file_exists || force_download;
1320
+ if (!should_download) {
1321
+ if (!etag.empty() && etag != headers.etag) {
1322
+ LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
1323
+ should_download = true;
1324
+ } else if (!last_modified.empty() && last_modified != headers.last_modified) {
1325
+ LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
1326
+ should_download = true;
1327
+ }
1328
+ }
1329
+ if (should_download) {
1330
+ std::string path_temporary = path + ".downloadInProgress";
1331
+ if (file_exists) {
1332
+ LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
1333
+ if (remove(path.c_str()) != 0) {
1334
+ LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
1335
+ return false;
1336
+ }
1337
+ }
1338
+
1339
+ // Set the output file
1340
+
1341
+ struct FILE_deleter {
1342
+ void operator()(FILE * f) const {
1343
+ fclose(f);
1344
+ }
1345
+ };
1346
+
1347
+ std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
1348
+ if (!outfile) {
1349
+ LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
1350
+ return false;
1351
+ }
1352
+
1353
+ typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
1354
+ auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
1355
+ return fwrite(data, size, nmemb, (FILE *)fd);
1356
+ };
1357
+ curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
1358
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
1359
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
1360
+
1361
+ // display download progress
1362
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
1363
+
1364
+ // helper function to hide password in URL
1365
+ auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
1366
+ std::size_t protocol_pos = url.find("://");
1367
+ if (protocol_pos == std::string::npos) {
1368
+ return url; // Malformed URL
1369
+ }
1370
+
1371
+ std::size_t at_pos = url.find('@', protocol_pos + 3);
1372
+ if (at_pos == std::string::npos) {
1373
+ return url; // No password in URL
1374
+ }
1375
+
1376
+ return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
1377
+ };
1378
+
1379
+ // start the download
1380
+ LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
1381
+ llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
1382
+ bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
1383
+ if (!was_perform_successful) {
1384
+ return false;
1385
+ }
1386
+
1387
+ long http_code = 0;
1388
+ curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
1389
+ if (http_code < 200 || http_code >= 400) {
1390
+ LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
1391
+ return false;
1392
+ }
1393
+
1394
+ // Causes file to be closed explicitly here before we rename it.
1395
+ outfile.reset();
1396
+
1397
+ // Write the updated JSON metadata file.
1398
+ metadata.update({
1399
+ {"url", url},
1400
+ {"etag", headers.etag},
1401
+ {"lastModified", headers.last_modified}
1402
+ });
1403
+ std::ofstream(metadata_path) << metadata.dump(4);
1404
+ LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
1405
+
1406
+ if (rename(path_temporary.c_str(), path.c_str()) != 0) {
1407
+ LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
1408
+ return false;
1409
+ }
1410
+ }
1411
+
1412
+ return true;
1413
+ }
1414
+
1415
+ struct llama_model * common_load_model_from_url(
1416
+ const std::string & model_url,
1417
+ const std::string & local_path,
1418
+ const std::string & hf_token,
1419
+ const struct llama_model_params & params) {
1420
+ // Basic validation of the model_url
1421
+ if (model_url.empty()) {
1422
+ LOG_ERR("%s: invalid model_url\n", __func__);
1423
+ return NULL;
1424
+ }
1425
+
1426
+ if (!common_download_file(model_url, local_path, hf_token)) {
1427
+ return NULL;
1428
+ }
1429
+
1430
+ // check for additional GGUFs split to download
1431
+ int n_split = 0;
1432
+ {
1433
+ struct lm_gguf_init_params lm_gguf_params = {
1434
+ /*.no_alloc = */ true,
1435
+ /*.ctx = */ NULL,
1436
+ };
1437
+ auto * ctx_gguf = lm_gguf_init_from_file(local_path.c_str(), lm_gguf_params);
1438
+ if (!ctx_gguf) {
1439
+ LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
1440
+ return NULL;
1441
+ }
1442
+
1443
+ auto key_n_split = lm_gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
1444
+ if (key_n_split >= 0) {
1445
+ n_split = lm_gguf_get_val_u16(ctx_gguf, key_n_split);
1446
+ }
1447
+
1448
+ lm_gguf_free(ctx_gguf);
1449
+ }
1450
+
1451
+ if (n_split > 1) {
1452
+ char split_prefix[PATH_MAX] = {0};
1453
+ char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
1454
+
1455
+ // Verify the first split file format
1456
+ // and extract split URL and PATH prefixes
1457
+ {
1458
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
1459
+ LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
1460
+ return NULL;
1461
+ }
1462
+
1463
+ if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
1464
+ LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
1465
+ return NULL;
1466
+ }
1467
+ }
1468
+
1469
+ // Prepare download in parallel
1470
+ std::vector<std::future<bool>> futures_download;
1471
+ for (int idx = 1; idx < n_split; idx++) {
1472
+ futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
1473
+ char split_path[PATH_MAX] = {0};
1474
+ llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
1475
+
1476
+ char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
1477
+ llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
1478
+
1479
+ return common_download_file(split_url, split_path, hf_token);
1480
+ }, idx));
1481
+ }
1482
+
1483
+ // Wait for all downloads to complete
1484
+ for (auto & f : futures_download) {
1485
+ if (!f.get()) {
1486
+ return NULL;
1487
+ }
1488
+ }
1489
+ }
1490
+
1491
+ return llama_model_load_from_file(local_path.c_str(), params);
1492
+ }
1493
+
1494
+ struct llama_model * common_load_model_from_hf(
1495
+ const std::string & repo,
1496
+ const std::string & remote_path,
1497
+ const std::string & local_path,
1498
+ const std::string & hf_token,
1499
+ const struct llama_model_params & params) {
1500
+ // construct hugging face model url:
1501
+ //
1502
+ // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
1503
+ // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
1504
+ //
1505
+ // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
1506
+ // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
1507
+ //
1508
+
1509
+ std::string model_url = "https://huggingface.co/";
1510
+ model_url += repo;
1511
+ model_url += "/resolve/main/";
1512
+ model_url += remote_path;
1513
+
1514
+ return common_load_model_from_url(model_url, local_path, hf_token, params);
1515
+ }
1516
+
1517
+ /**
1518
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
1519
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
1520
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
1521
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
1522
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
1523
+ *
1524
+ * Return pair of <repo, file> (with "repo" already having tag removed)
1525
+ *
1526
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
1527
+ */
1528
+ std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
1529
+ auto parts = string_split<std::string>(hf_repo_with_tag, ':');
1530
+ std::string tag = parts.size() > 1 ? parts.back() : "latest";
1531
+ std::string hf_repo = parts[0];
1532
+ if (string_split<std::string>(hf_repo, '/').size() != 2) {
1533
+ throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
1534
+ }
1535
+
1536
+ // fetch model info from Hugging Face Hub API
1537
+ json model_info;
1538
+ curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
1539
+ curl_slist_ptr http_headers;
1540
+ std::string res_str;
1541
+ std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
1542
+ curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
1543
+ curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
1544
+ typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
1545
+ auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
1546
+ static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
1547
+ return size * nmemb;
1548
+ };
1549
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
1550
+ curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
1551
+ #if defined(_WIN32)
1552
+ curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
1553
+ #endif
1554
+ if (!hf_token.empty()) {
1555
+ std::string auth_header = "Authorization: Bearer " + hf_token;
1556
+ http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
1557
+ }
1558
+ // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
1559
+ http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
1560
+ http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
1561
+ curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
1562
+
1563
+ CURLcode res = curl_easy_perform(curl.get());
1564
+
1565
+ if (res != CURLE_OK) {
1566
+ throw std::runtime_error("error: cannot make GET request to HF API");
1567
+ }
1568
+
1569
+ long res_code;
1570
+ curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
1571
+ if (res_code == 200) {
1572
+ model_info = json::parse(res_str);
1573
+ } else if (res_code == 401) {
1574
+ throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
1575
+ } else {
1576
+ throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
1577
+ }
1578
+
1579
+ // check response
1580
+ if (!model_info.contains("ggufFile")) {
1581
+ throw std::runtime_error("error: model does not have ggufFile");
1582
+ }
1583
+ json & lm_gguf_file = model_info.at("ggufFile");
1584
+ if (!lm_gguf_file.contains("rfilename")) {
1585
+ throw std::runtime_error("error: ggufFile does not have rfilename");
1586
+ }
1587
+
1588
+ return std::make_pair(hf_repo, lm_gguf_file.at("rfilename"));
1589
+ }
1590
+
1591
+ #else
1592
+
1593
+ struct llama_model * common_load_model_from_url(
1594
+ const std::string & /*model_url*/,
1595
+ const std::string & /*local_path*/,
1596
+ const std::string & /*hf_token*/,
1597
+ const struct llama_model_params & /*params*/) {
1598
+ LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
1599
+ return nullptr;
1600
+ }
1601
+
1602
+ struct llama_model * common_load_model_from_hf(
1603
+ const std::string & /*repo*/,
1604
+ const std::string & /*remote_path*/,
1605
+ const std::string & /*local_path*/,
1606
+ const std::string & /*hf_token*/,
1607
+ const struct llama_model_params & /*params*/) {
1608
+ LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
1609
+ return nullptr;
1610
+ }
1611
+
1612
+ std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
1613
+ LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
1614
+ return std::make_pair("", "");
1615
+ }
1616
+
1617
+ #endif // LLAMA_USE_CURL
1618
+
1619
+ //
1620
+ // Batch utils
1621
+ //
1622
+
1623
+ void common_batch_clear(struct llama_batch & batch) {
1624
+ batch.n_tokens = 0;
1625
+ }
1626
+
1627
+ void common_batch_add(
1628
+ struct llama_batch & batch,
1629
+ llama_token id,
1630
+ llama_pos pos,
1631
+ const std::vector<llama_seq_id> & seq_ids,
1632
+ bool logits) {
1633
+ LM_GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
1634
+
1635
+ batch.token [batch.n_tokens] = id;
1636
+ batch.pos [batch.n_tokens] = pos;
1637
+ batch.n_seq_id[batch.n_tokens] = seq_ids.size();
1638
+ for (size_t i = 0; i < seq_ids.size(); ++i) {
1639
+ batch.seq_id[batch.n_tokens][i] = seq_ids[i];
1640
+ }
1641
+ batch.logits [batch.n_tokens] = logits;
1642
+
1643
+ batch.n_tokens++;
1644
+ }
1645
+
1646
+ //
1647
+ // Token utils
1648
+ //
1649
+
1650
+ size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
1651
+ size_t i;
1652
+ for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
1653
+
1654
+ return i;
1655
+ }
1656
+
1657
+ size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
1658
+ // check for empty sequences
1659
+ if (a.empty() || b.empty()) {
1660
+ return 0;
1661
+ }
1662
+
1663
+ // get the lengths of the input sequences
1664
+ size_t a_len = a.size();
1665
+ size_t b_len = b.size();
1666
+
1667
+ // initialize the maximum length of the longest common subsequence (LCS)
1668
+ size_t max_length = 0;
1669
+
1670
+ // use two rows instead of a 2D matrix to optimize space
1671
+ std::vector<size_t> prev_row(b_len + 1, 0);
1672
+ std::vector<size_t> curr_row(b_len + 1, 0);
1673
+
1674
+ // iterate through the elements of a
1675
+ for (size_t i = 1; i <= a_len; i++) {
1676
+ // iterate through the elements of b
1677
+ for (size_t j = 1; j <= b_len; j++) {
1678
+ // if elements at the current positions match
1679
+ if (a[i - 1] == b[j - 1]) {
1680
+ // if it's the first element of either sequences, set LCS length to 1
1681
+ if (i == 1 || j == 1) {
1682
+ curr_row[j] = 1;
1683
+ } else {
1684
+ // increment LCS length by 1 compared to the previous element
1685
+ curr_row[j] = prev_row[j - 1] + 1;
1686
+ }
1687
+
1688
+ // update max_length if necessary
1689
+ if (curr_row[j] > max_length) {
1690
+ max_length = curr_row[j];
1691
+ }
1692
+ } else {
1693
+ // reset LCS length if elements don't match
1694
+ curr_row[j] = 0;
1695
+ }
1696
+ }
1697
+
1698
+ // update the previous row for the next iteration
1699
+ prev_row = curr_row;
1700
+ }
1701
+
1702
+ // return the maximum length of the LCS
1703
+ return max_length;
1704
+ }
1705
+
1706
+ //
1707
+ // Vocab utils
1708
+ //
1709
+
1710
+ std::vector<llama_token> common_tokenize(
1711
+ const struct llama_context * ctx,
1712
+ const std::string & text,
1713
+ bool add_special,
1714
+ bool parse_special) {
1715
+ const llama_model * model = llama_get_model(ctx);
1716
+ const llama_vocab * vocab = llama_model_get_vocab(model);
1717
+ return common_tokenize(vocab, text, add_special, parse_special);
1718
+ }
1719
+
1720
+ std::vector<llama_token> common_tokenize(
1721
+ const struct llama_vocab * vocab,
1722
+ const std::string & text,
1723
+ bool add_special,
1724
+ bool parse_special) {
1725
+ // upper limit for the number of tokens
1726
+ int n_tokens = text.length() + 2 * add_special;
1727
+ std::vector<llama_token> result(n_tokens);
1728
+ n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
1729
+ if (n_tokens < 0) {
1730
+ result.resize(-n_tokens);
1731
+ int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
1732
+ LM_GGML_ASSERT(check == -n_tokens);
1733
+ } else {
1734
+ result.resize(n_tokens);
1735
+ }
1736
+ return result;
1737
+ }
1738
+
1739
+ std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
1740
+ const llama_model * model = llama_get_model(ctx);
1741
+ const llama_vocab * vocab = llama_model_get_vocab(model);
1742
+ return common_token_to_piece(vocab, token, special);
1743
+ }
1744
+
1745
+ std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
1746
+ std::string piece;
1747
+ piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
1748
+ const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
1749
+ if (n_chars < 0) {
1750
+ piece.resize(-n_chars);
1751
+ int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
1752
+ LM_GGML_ASSERT(check == -n_chars);
1753
+ }
1754
+ else {
1755
+ piece.resize(n_chars);
1756
+ }
1757
+
1758
+ return piece;
1759
+ }
1760
+
1761
+ std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
1762
+ const llama_model * model = llama_get_model(ctx);
1763
+ const llama_vocab * vocab = llama_model_get_vocab(model);
1764
+ return common_detokenize(vocab, tokens, special);
1765
+ }
1766
+
1767
+ std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
1768
+ std::string text;
1769
+ text.resize(std::max(text.capacity(), tokens.size()));
1770
+ int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
1771
+ if (n_chars < 0) {
1772
+ text.resize(-n_chars);
1773
+ n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
1774
+ LM_GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
1775
+ }
1776
+
1777
+ text.resize(n_chars);
1778
+
1779
+ // NOTE: the original tokenizer decodes bytes after collecting the pieces.
1780
+ return text;
1781
+ }
1782
+
1783
+ //
1784
+ // KV cache utils
1785
+ //
1786
+
1787
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
1788
+ static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
1789
+
1790
+ printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
1791
+ view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
1792
+
1793
+ llama_kv_cache_view_cell * c_curr = view.cells;
1794
+ llama_seq_id * cs_curr = view.cells_sequences;
1795
+
1796
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1797
+ if (i % row_size == 0) {
1798
+ printf("\n%5d: ", i);
1799
+ }
1800
+ int seq_count = 0;
1801
+ for (int j = 0; j < view.n_seq_max; j++) {
1802
+ if (cs_curr[j] >= 0) { seq_count++; }
1803
+ }
1804
+ putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
1805
+ }
1806
+
1807
+ printf("\n=== Done dumping\n");
1808
+ }
1809
+
1810
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
1811
+ static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
1812
+
1813
+ printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
1814
+ view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
1815
+
1816
+ std::unordered_map<llama_seq_id, size_t> seqs;
1817
+ llama_kv_cache_view_cell * c_curr = view.cells;
1818
+ llama_seq_id * cs_curr = view.cells_sequences;
1819
+
1820
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1821
+ for (int j = 0; j < view.n_seq_max; j++) {
1822
+ if (cs_curr[j] < 0) { continue; }
1823
+ if (seqs.find(cs_curr[j]) == seqs.end()) {
1824
+ if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
1825
+ const size_t sz = seqs.size();
1826
+ seqs[cs_curr[j]] = sz;
1827
+ }
1828
+ }
1829
+ if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
1830
+ }
1831
+
1832
+ printf("=== Sequence legend: ");
1833
+ for (const auto & it : seqs) {
1834
+ printf("%zu=%d, ", it.second, it.first);
1835
+ }
1836
+ printf("'+'=other sequence ids");
1837
+
1838
+ c_curr = view.cells;
1839
+ cs_curr = view.cells_sequences;
1840
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
1841
+ if (i % row_size == 0) {
1842
+ printf("\n%5d: ", i);
1843
+ }
1844
+ for (int j = 0; j < view.n_seq_max; j++) {
1845
+ if (cs_curr[j] >= 0) {
1846
+ const auto & it = seqs.find(cs_curr[j]);
1847
+ putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
1848
+ } else {
1849
+ putchar('.');
1850
+ }
1851
+ }
1852
+ putchar(' ');
1853
+ }
1854
+
1855
+ printf("\n=== Done dumping\n");
1856
+ }
1857
+
1858
+ //
1859
+ // Embedding utils
1860
+ //
1861
+
1862
/**
 * Scale an embedding vector and write the result to out.
 *
 * inp and out each hold n floats. embd_norm selects the divisor:
 *   -1 : none (values are copied unchanged)
 *    0 : largest absolute value divided by 32760 (maps the peak to an int16-like range)
 *    2 : euclidean (L2) norm
 *    p : any other value is treated as the exponent of a p-norm
 * When the divisor is not positive (e.g. an all-zero input), out is filled with zeros.
 */
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
    double divisor = 0.0;

    if (embd_norm == -1) {
        // no normalisation
        divisor = 1.0;
    } else if (embd_norm == 0) {
        // max absolute
        for (int i = 0; i < n; i++) {
            if (divisor < std::abs(inp[i])) {
                divisor = std::abs(inp[i]);
            }
        }
        divisor /= 32760.0; // make an int16 range
    } else if (embd_norm == 2) {
        // euclidean
        for (int i = 0; i < n; i++) {
            divisor += inp[i] * inp[i];
        }
        divisor = std::sqrt(divisor);
    } else {
        // p-norm (euclidean is p-norm p=2)
        for (int i = 0; i < n; i++) {
            divisor += std::pow(std::abs(inp[i]), embd_norm);
        }
        divisor = std::pow(divisor, 1.0 / embd_norm);
    }

    // guard against division by zero: a non-positive divisor zeroes the output
    const float norm = divisor > 0.0 ? 1.0 / divisor : 0.0f;

    for (int i = 0; i < n; i++) {
        out[i] = inp[i] * norm;
    }
}
1897
+
1898
/**
 * Cosine similarity of two embedding vectors of n floats each.
 *
 * Returns dot(embd1, embd2) / (|embd1| * |embd2|). Zero vectors are handled
 * explicitly: two zero vectors yield 1.0, exactly one zero vector yields 0.0.
 */
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
    double dot    = 0.0;
    double sq_len1 = 0.0;
    double sq_len2 = 0.0;

    for (int i = 0; i < n; i++) {
        dot     += embd1[i] * embd2[i];
        sq_len1 += embd1[i] * embd1[i];
        sq_len2 += embd2[i] * embd2[i];
    }

    // Handle the case where one or both vectors are zero vectors
    const bool is_zero1 = sq_len1 == 0.0;
    const bool is_zero2 = sq_len2 == 0.0;
    if (is_zero1 && is_zero2) {
        return 1.0f; // two zero vectors are similar
    }
    if (is_zero1 || is_zero2) {
        return 0.0f;
    }

    return dot / (sqrt(sq_len1) * sqrt(sq_len2));
}
1919
+
1920
+ //
1921
+ // Control vector utils
1922
+ //
1923
+
1924
+ static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
1925
+ common_control_vector_data result = { -1, {} };
1926
+
1927
+ lm_ggml_context * ctx = nullptr;
1928
+ struct lm_gguf_init_params meta_lm_gguf_params = {
1929
+ /* .no_alloc = */ false,
1930
+ /* .ctx = */ &ctx,
1931
+ };
1932
+ struct lm_gguf_context * ctx_gguf = lm_gguf_init_from_file(load_info.fname.c_str(), meta_lm_gguf_params);
1933
+ if (!ctx_gguf) {
1934
+ LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
1935
+ return result;
1936
+ }
1937
+
1938
+ int32_t n_tensors = lm_gguf_get_n_tensors(ctx_gguf);
1939
+ if (n_tensors == 0) {
1940
+ LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
1941
+ }
1942
+
1943
+ for (int i = 0; i < n_tensors; i++) {
1944
+ std::string name = lm_gguf_get_tensor_name(ctx_gguf, i);
1945
+
1946
+ int layer_idx = -1;
1947
+
1948
+ // split on '.'
1949
+ size_t dotpos = name.find('.');
1950
+ if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
1951
+ try {
1952
+ layer_idx = std::stoi(name.substr(dotpos + 1));
1953
+ } catch (...) {
1954
+ layer_idx = -1;
1955
+ }
1956
+ }
1957
+ if (layer_idx < 0) {
1958
+ LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
1959
+ result.n_embd = -1;
1960
+ break;
1961
+ } else if (layer_idx == 0) {
1962
+ LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
1963
+ result.n_embd = -1;
1964
+ break;
1965
+ }
1966
+
1967
+ struct lm_ggml_tensor * tensor = lm_ggml_get_tensor(ctx, name.c_str());
1968
+ if (tensor->type != LM_GGML_TYPE_F32) {
1969
+ LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
1970
+ result.n_embd = -1;
1971
+ break;
1972
+ }
1973
+ if (lm_ggml_n_dims(tensor) != 1) {
1974
+ LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
1975
+ result.n_embd = -1;
1976
+ break;
1977
+ }
1978
+
1979
+ if (result.n_embd == -1) {
1980
+ result.n_embd = lm_ggml_nelements(tensor);
1981
+ } else if (lm_ggml_nelements(tensor) != result.n_embd) {
1982
+ LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
1983
+ result.n_embd = -1;
1984
+ break;
1985
+ }
1986
+
1987
+ // extend if necessary - do not store data for layer 0 (it's not used)
1988
+ result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
1989
+
1990
+ const float * src = (const float *) tensor->data;
1991
+ float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
1992
+ for (int j = 0; j < result.n_embd; j++) {
1993
+ dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
1994
+ }
1995
+
1996
+ }
1997
+
1998
+ if (result.n_embd == -1) {
1999
+ LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
2000
+ result.data.clear();
2001
+ }
2002
+
2003
+ lm_gguf_free(ctx_gguf);
2004
+ lm_ggml_free(ctx);
2005
+
2006
+ return result;
2007
+ }
2008
+
2009
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
2010
+ common_control_vector_data result = { -1, {} };
2011
+
2012
+ for (const auto & info : load_infos) {
2013
+ auto cur = common_control_vector_load_one(info);
2014
+
2015
+ if (cur.n_embd == -1) {
2016
+ result.n_embd = -1;
2017
+ break;
2018
+ }
2019
+ if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
2020
+ LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
2021
+ result.n_embd = -1;
2022
+ break;
2023
+ }
2024
+
2025
+ if (result.n_embd == -1) {
2026
+ result = std::move(cur);
2027
+ } else {
2028
+ result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
2029
+ for (size_t i = 0; i < cur.data.size(); i++) {
2030
+ result.data[i] += cur.data[i];
2031
+ }
2032
+ }
2033
+ }
2034
+
2035
+ if (result.n_embd == -1) {
2036
+ LOG_ERR("%s: no valid control vector files passed\n", __func__);
2037
+ result.data.clear();
2038
+ }
2039
+
2040
+ return result;
2041
+ }
2042
+
2043
+ template <>
2044
+ json common_grammar_trigger::to_json() const {
2045
+ json out {
2046
+ {"type", (int) type},
2047
+ {"value", value},
2048
+ };
2049
+ if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
2050
+ out["token"] = (int) token;
2051
+ }
2052
+ return out;
2053
+ }
2054
+
2055
+ template <>
2056
+ common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
2057
+ common_grammar_trigger out;
2058
+ out.type = (common_grammar_trigger_type) in.at("type").get<int>();
2059
+ out.value = in.at("value").get<std::string>();
2060
+ if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
2061
+ out.token = (llama_token) in.at("token").get<int>();
2062
+ }
2063
+ return out;
2064
+ }