@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/examples/rpc/rpc-server.cpp:

```diff
@@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
 #include "ggml-cpu.h"
 
 #ifdef GGML_USE_CUDA
@@ -18,26 +22,149 @@
 
 #include "ggml-rpc.h"
 #ifdef _WIN32
+#  define NOMINMAX
+#  define DIRECTORY_SEPARATOR '\\'
+#  include <locale>
 #  include <windows.h>
+#  include <fcntl.h>
+#  include <io.h>
 #else
+#  define DIRECTORY_SEPARATOR '/'
 #  include <unistd.h>
+#  include <sys/stat.h>
 #endif
+#include <codecvt>
 #include <string>
 #include <stdio.h>
+#include <vector>
+#include <filesystem>
+#include <algorithm>
+#include <thread>
+
+namespace fs = std::filesystem;
+
+// NOTE: this is copied from common.cpp to avoid linking with libcommon
+// returns true if successful, false otherwise
+static bool fs_create_directory_with_parents(const std::string & path) {
+#ifdef _WIN32
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+    std::wstring wpath = converter.from_bytes(path);
+
+    // if the path already exists, check whether it's a directory
+    const DWORD attributes = GetFileAttributesW(wpath.c_str());
+    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+        return true;
+    }
+
+    size_t pos_slash = 0;
+
+    // process path from front to back, procedurally creating directories
+    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
+        const std::wstring subpath = wpath.substr(0, pos_slash);
+        const wchar_t * test = subpath.c_str();
+
+        const bool success = CreateDirectoryW(test, NULL);
+        if (!success) {
+            const DWORD error = GetLastError();
+
+            // if the path already exists, ensure that it's a directory
+            if (error == ERROR_ALREADY_EXISTS) {
+                const DWORD attributes = GetFileAttributesW(subpath.c_str());
+                if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+                    return false;
+                }
+            } else {
+                return false;
+            }
+        }
+
+        pos_slash += 1;
+    }
+
+    return true;
+#else
+    // if the path already exists, check whether it's a directory
+    struct stat info;
+    if (stat(path.c_str(), &info) == 0) {
+        return S_ISDIR(info.st_mode);
+    }
+
+    size_t pos_slash = 1; // skip leading slashes for directory creation
+
+    // process path from front to back, procedurally creating directories
+    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
+        const std::string subpath = path.substr(0, pos_slash);
+        struct stat info;
+
+        // if the path already exists, ensure that it's a directory
+        if (stat(subpath.c_str(), &info) == 0) {
+            if (!S_ISDIR(info.st_mode)) {
+                return false;
+            }
+        } else {
+            // create parent directories
+            const int ret = mkdir(subpath.c_str(), 0755);
+            if (ret != 0) {
+                return false;
+            }
+        }
+
+        pos_slash += 1;
+    }
+
+    return true;
+#endif // _WIN32
+}
+
+// NOTE: this is copied from common.cpp to avoid linking with libcommon
+static std::string fs_get_cache_directory() {
+    std::string cache_directory = "";
+    auto ensure_trailing_slash = [](std::string p) {
+        // Make sure to add trailing slash
+        if (p.back() != DIRECTORY_SEPARATOR) {
+            p += DIRECTORY_SEPARATOR;
+        }
+        return p;
+    };
+    if (getenv("LLAMA_CACHE")) {
+        cache_directory = std::getenv("LLAMA_CACHE");
+    } else {
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
+        if (std::getenv("XDG_CACHE_HOME")) {
+            cache_directory = std::getenv("XDG_CACHE_HOME");
+        } else {
+            cache_directory = std::getenv("HOME") + std::string("/.cache/");
+        }
+#elif defined(__APPLE__)
+        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
+#elif defined(_WIN32)
+        cache_directory = std::getenv("LOCALAPPDATA");
+#else
+#  error Unknown architecture
+#endif
+        cache_directory = ensure_trailing_slash(cache_directory);
+        cache_directory += "llama.cpp";
+    }
+    return ensure_trailing_slash(cache_directory);
+}
 
 struct rpc_server_params {
     std::string host        = "127.0.0.1";
     int         port        = 50052;
     size_t      backend_mem = 0;
+    bool        use_cache   = false;
+    int         n_threads   = std::max(1U, std::thread::hardware_concurrency()/2);
 };
 
 static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
     fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
     fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -H HOST, --host HOST  host to bind to (default: %s)\n", params.host.c_str());
-    fprintf(stderr, "  -p PORT, --port PORT  port to bind to (default: %d)\n", params.port);
-    fprintf(stderr, "  -m MEM, --mem MEM     backend memory size (in MB)\n");
+    fprintf(stderr, "  -h, --help                show this help message and exit\n");
+    fprintf(stderr, "  -t,      --threads        number of threads for the CPU backend (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -H HOST, --host HOST      host to bind to (default: %s)\n", params.host.c_str());
+    fprintf(stderr, "  -p PORT, --port PORT      port to bind to (default: %d)\n", params.port);
+    fprintf(stderr, "  -m MEM,  --mem MEM        backend memory size (in MB)\n");
+    fprintf(stderr, "  -c,      --cache          enable local file cache\n");
     fprintf(stderr, "\n");
 }
 
```
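Both fs_ helpers above are deliberately copied from common.cpp so the RPC server keeps building without libcommon. As an aside, the file already pulls in `<filesystem>`; a minimal sketch of the same parent-directory guarantee via C++17 is below. The helper name is hypothetical and not part of this diff:

```cpp
#include <filesystem>
#include <string>
#include <system_error>

// Hypothetical alternative, NOT in the diff: create_directories() makes all
// missing parents and is a no-op for paths that already exist.
static bool fs_create_directory_with_parents_alt(const std::string & path) {
    std::error_code ec;
    std::filesystem::create_directories(path, ec);
    // mirror the helper's contract: succeed only if the result is a directory
    return !ec && std::filesystem::is_directory(path, ec);
}
```

Keeping the hand-rolled version preserves byte-for-byte parity with common.cpp, including the deprecated `<codecvt>` usage that the new `_MSC_VER` guard at the top of the file silences.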
package/src/llama.cpp/examples/rpc/rpc-server.cpp (continued):

```diff
@@ -50,6 +177,15 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
                 return false;
             }
             params.host = argv[i];
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.n_threads = std::stoi(argv[i]);
+            if (params.n_threads <= 0) {
+                fprintf(stderr, "error: invalid number of threads: %d\n", params.n_threads);
+                return false;
+            }
         } else if (arg == "-p" || arg == "--port") {
             if (++i >= argc) {
                 return false;
```
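One caveat in the hunk above: std::stoi throws std::invalid_argument or std::out_of_range on malformed input (e.g. `--threads abc`), so the process dies with an unhandled exception instead of printing the usage text. A stricter parse might look like this sketch; parse_thread_count is a hypothetical helper, not part of the diff:

```cpp
#include <stdexcept>
#include <string>

// Hypothetical stricter replacement for the bare std::stoi call above.
static bool parse_thread_count(const std::string & arg, int & out) {
    try {
        size_t end = 0;
        const int value = std::stoi(arg, &end);
        if (end != arg.size() || value <= 0) {
            return false; // trailing junk or non-positive count
        }
        out = value;
        return true;
    } catch (const std::exception &) {
        return false; // not a number, or out of int range
    }
}
```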
package/src/llama.cpp/examples/rpc/rpc-server.cpp (continued):

```diff
@@ -58,6 +194,8 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
             if (params.port <= 0 || params.port > 65535) {
                 return false;
             }
+        } else if (arg == "-c" || arg == "--cache") {
+            params.use_cache = true;
         } else if (arg == "-m" || arg == "--mem") {
             if (++i >= argc) {
                 return false;
@@ -75,7 +213,7 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
     return true;
 }
 
-static ggml_backend_t create_backend() {
+static ggml_backend_t create_backend(const rpc_server_params & params) {
     ggml_backend_t backend = NULL;
 #ifdef GGML_USE_CUDA
     fprintf(stderr, "%s: using CUDA backend\n", __func__);
@@ -107,6 +245,7 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: using CPU backend\n", __func__);
         backend = ggml_backend_cpu_init();
+        ggml_backend_cpu_set_n_threads(backend, params.n_threads);
     }
     return backend;
 }
@@ -151,7 +290,7 @@ int main(int argc, char * argv[]) {
         fprintf(stderr, "\n");
     }
 
-    ggml_backend_t backend = create_backend();
+    ggml_backend_t backend = create_backend(params);
     if (!backend) {
         fprintf(stderr, "Failed to create backend\n");
         return 1;
@@ -164,8 +303,24 @@ int main(int argc, char * argv[]) {
     } else {
         get_backend_memory(&free_mem, &total_mem);
     }
-    printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
-    ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
+    const char * cache_dir = nullptr;
+    std::string cache_dir_str;
+    if (params.use_cache) {
+        cache_dir_str = fs_get_cache_directory() + "rpc/";
+        if (!fs_create_directory_with_parents(cache_dir_str)) {
+            fprintf(stderr, "Failed to create cache directory: %s\n", cache_dir_str.c_str());
+            return 1;
+        }
+        cache_dir = cache_dir_str.c_str();
+    }
+    printf("Starting RPC server v%d.%d.%d\n",
+        RPC_PROTO_MAJOR_VERSION,
+        RPC_PROTO_MINOR_VERSION,
+        RPC_PROTO_PATCH_VERSION);
+    printf("  endpoint       : %s\n", endpoint.c_str());
+    printf("  local cache    : %s\n", cache_dir ? cache_dir : "n/a");
+    printf("  backend memory : %zu MB\n", free_mem / (1024 * 1024));
+    ggml_backend_rpc_start_server(backend, endpoint.c_str(), cache_dir, free_mem, total_mem);
     ggml_backend_free(backend);
     return 0;
 }
```
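Taken together: `rpc-server --cache` resolves `<cache>/llama.cpp/rpc/` (honoring `LLAMA_CACHE`), creates it if needed, and hands it to ggml_backend_rpc_start_server, which now takes the cache directory as an extra argument; the startup banner reports the RPC protocol version via the RPC_PROTO_*_VERSION constants, presumably added in the updated ggml-rpc.h (+6 -1 above). Nothing changes for clients. A minimal connection sketch, assuming the client-side entry points ggml_backend_rpc_init and ggml_backend_rpc_get_device_memory declared in ggml-rpc.h, with error handling trimmed:

```cpp
#include "ggml-rpc.h"

#include <cstdio>

int main() {
    const char * endpoint = "127.0.0.1:50052"; // the server's defaults above

    // ask the server for the memory figures it advertised at startup
    size_t free_mem = 0, total_mem = 0;
    ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);
    printf("remote memory: %zu MB free of %zu MB\n",
           free_mem / (1024 * 1024), total_mem / (1024 * 1024));

    // create a backend that forwards tensor ops to the remote server
    ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
    if (!backend) {
        fprintf(stderr, "failed to connect to %s\n", endpoint);
        return 1;
    }
    ggml_backend_free(backend);
    return 0;
}
```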
package/src/llama.cpp/examples/run/CMakeLists.txt:

```diff
@@ -1,5 +1,16 @@
 set(TARGET llama-run)
 add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
+
+# TODO: avoid copying this code block from common/CMakeLists.txt
+set(LLAMA_RUN_EXTRA_LIBS "")
+if (LLAMA_CURL)
+    find_package(CURL REQUIRED)
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
+    include_directories(${CURL_INCLUDE_DIRS})
+    find_library(CURL_LIBRARY curl REQUIRED)
+    set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARY})
+endif ()
+
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
```
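This duplicates the CURL detection from common/CMakeLists.txt (hence the TODO): when the tree is configured with `-DLLAMA_CURL=ON`, llama-run is compiled with LLAMA_USE_CURL and linked against libcurl directly, so its built-in downloader is available without inheriting the setting through libcommon.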
package/src/llama.cpp/examples/run/run.cpp:

```diff
@@ -38,24 +38,6 @@
     }
 #endif
 
-GGML_ATTRIBUTE_FORMAT(1, 2)
-static std::string fmt(const char * fmt, ...) {
-    va_list ap;
-    va_list ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    const int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
-    std::string buf;
-    buf.resize(size);
-    const int size2 = vsnprintf(const_cast<char *>(buf.data()), buf.size() + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-
-    return buf;
-}
-
 GGML_ATTRIBUTE_FORMAT(1, 2)
 static int printe(const char * fmt, ...) {
     va_list args;
@@ -525,11 +507,11 @@ class HttpClient {
         int secs = static_cast<int>(seconds) % 60;
 
         if (hrs > 0) {
-            return fmt("%dh %02dm %02ds", hrs, mins, secs);
+            return string_format("%dh %02dm %02ds", hrs, mins, secs);
         } else if (mins > 0) {
-            return fmt("%dm %02ds", mins, secs);
+            return string_format("%dm %02ds", mins, secs);
        } else {
-            return fmt("%ds", secs);
+            return string_format("%ds", secs);
         }
     }
 
@@ -544,7 +526,7 @@ class HttpClient {
             }
         }
 
-        return fmt("%.2f %s", dbl_size, suffix[i]);
+        return string_format("%.2f %s", dbl_size, suffix[i]);
     }
 
     static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
@@ -578,7 +560,9 @@ class HttpClient {
         return (now_downloaded_plus_file_size * 100) / total_to_download;
     }
 
-    static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast<long int>(percentage)); }
+    static std::string generate_progress_prefix(curl_off_t percentage) {
+        return string_format("%3ld%% |", static_cast<long int>(percentage));
+    }
 
     static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
         const auto now = std::chrono::steady_clock::now();
@@ -589,9 +573,9 @@ class HttpClient {
     static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download,
                                                 double speed, double estimated_time) {
         const int width = 10;
-        return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width,
-                   human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width,
-                   human_readable_time(estimated_time).c_str());
+        return string_format("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(),
+                             width, human_readable_size(total_to_download).c_str(), width,
+                             human_readable_size(speed).c_str(), width, human_readable_time(estimated_time).c_str());
     }
 
     static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {
@@ -713,8 +697,10 @@ class LlamaData {
         std::vector<std::string> headers = { "User-Agent: llama-cpp", "Accept: application/json" };
         std::string url;
 
+        std::string model_endpoint = get_model_endpoint();
+
         if (pos == std::string::npos) {
-            auto [model_name, manifest_url] = extract_model_and_tag(model, "https://huggingface.co/v2/");
+            auto [model_name, manifest_url] = extract_model_and_tag(model, model_endpoint + "v2/");
             hfr = model_name;
 
             nlohmann::json manifest;
@@ -729,7 +715,7 @@ class LlamaData {
             hff = model.substr(pos + 1);
         }
 
-        url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
+        url = model_endpoint + hfr + "/resolve/main/" + hff;
 
         return download(url, bn, true, headers);
     }
```
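The local fmt() helper was a duplicate of string_format() from common.h (which llama-run already links), and both hard-coded Hugging Face URLs now go through get_model_endpoint(). That helper is not shown in this diff; below is a hedged sketch of the resolution it plausibly performs, where the MODEL_ENDPOINT / HF_ENDPOINT variable names are assumptions based on llama.cpp's common helpers:

```cpp
#include <cstdlib>
#include <string>

// Hedged sketch, not the actual implementation: an overridable model host
// that falls back to Hugging Face.
static std::string get_model_endpoint_sketch() {
    const char * env = std::getenv("MODEL_ENDPOINT");
    if (env == nullptr) {
        env = std::getenv("HF_ENDPOINT"); // assumed older override
    }
    std::string endpoint = env ? env : "https://huggingface.co/";
    if (!endpoint.empty() && endpoint.back() != '/') {
        endpoint += '/'; // callers append "v2/" etc., so keep it slash-terminated
    }
    return endpoint;
}
```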