@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/examples/rpc/rpc-server.cpp

```diff
@@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
 #include "ggml-cpu.h"
 
 #ifdef GGML_USE_CUDA
@@ -18,26 +22,149 @@
 
 #include "ggml-rpc.h"
 #ifdef _WIN32
+#  define NOMINMAX
+#  define DIRECTORY_SEPARATOR '\\'
+#  include <locale>
 #  include <windows.h>
+#  include <fcntl.h>
+#  include <io.h>
 #else
+#  define DIRECTORY_SEPARATOR '/'
 #  include <unistd.h>
+#  include <sys/stat.h>
 #endif
+#include <codecvt>
 #include <string>
 #include <stdio.h>
+#include <vector>
+#include <filesystem>
+#include <algorithm>
+#include <thread>
+
+namespace fs = std::filesystem;
+
+// NOTE: this is copied from common.cpp to avoid linking with libcommon
+// returns true if successful, false otherwise
+static bool fs_create_directory_with_parents(const std::string & path) {
+#ifdef _WIN32
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+    std::wstring wpath = converter.from_bytes(path);
+
+    // if the path already exists, check whether it's a directory
+    const DWORD attributes = GetFileAttributesW(wpath.c_str());
+    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+        return true;
+    }
+
+    size_t pos_slash = 0;
+
+    // process path from front to back, procedurally creating directories
+    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
+        const std::wstring subpath = wpath.substr(0, pos_slash);
+        const wchar_t * test = subpath.c_str();
+
+        const bool success = CreateDirectoryW(test, NULL);
+        if (!success) {
+            const DWORD error = GetLastError();
+
+            // if the path already exists, ensure that it's a directory
+            if (error == ERROR_ALREADY_EXISTS) {
+                const DWORD attributes = GetFileAttributesW(subpath.c_str());
+                if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+                    return false;
+                }
+            } else {
+                return false;
+            }
+        }
+
+        pos_slash += 1;
+    }
+
+    return true;
+#else
+    // if the path already exists, check whether it's a directory
+    struct stat info;
+    if (stat(path.c_str(), &info) == 0) {
+        return S_ISDIR(info.st_mode);
+    }
+
+    size_t pos_slash = 1; // skip leading slashes for directory creation
+
+    // process path from front to back, procedurally creating directories
+    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
+        const std::string subpath = path.substr(0, pos_slash);
+        struct stat info;
+
+        // if the path already exists, ensure that it's a directory
+        if (stat(subpath.c_str(), &info) == 0) {
+            if (!S_ISDIR(info.st_mode)) {
+                return false;
+            }
+        } else {
+            // create parent directories
+            const int ret = mkdir(subpath.c_str(), 0755);
+            if (ret != 0) {
+                return false;
+            }
+        }
+
+        pos_slash += 1;
+    }
+
+    return true;
+#endif // _WIN32
+}
+
+// NOTE: this is copied from common.cpp to avoid linking with libcommon
+static std::string fs_get_cache_directory() {
+    std::string cache_directory = "";
+    auto ensure_trailing_slash = [](std::string p) {
+        // Make sure to add trailing slash
+        if (p.back() != DIRECTORY_SEPARATOR) {
+            p += DIRECTORY_SEPARATOR;
+        }
+        return p;
+    };
+    if (getenv("LLAMA_CACHE")) {
+        cache_directory = std::getenv("LLAMA_CACHE");
+    } else {
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
+        if (std::getenv("XDG_CACHE_HOME")) {
+            cache_directory = std::getenv("XDG_CACHE_HOME");
+        } else {
+            cache_directory = std::getenv("HOME") + std::string("/.cache/");
+        }
+#elif defined(__APPLE__)
+        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
+#elif defined(_WIN32)
+        cache_directory = std::getenv("LOCALAPPDATA");
+#else
+#  error Unknown architecture
+#endif
+        cache_directory = ensure_trailing_slash(cache_directory);
+        cache_directory += "llama.cpp";
+    }
+    return ensure_trailing_slash(cache_directory);
+}
 
 struct rpc_server_params {
     std::string host        = "127.0.0.1";
     int         port        = 50052;
     size_t      backend_mem = 0;
+    bool        use_cache   = false;
+    int         n_threads   = std::max(1U, std::thread::hardware_concurrency()/2);
 };
 
 static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
     fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
     fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help
-    fprintf(stderr, "  -
-    fprintf(stderr, "  -
-    fprintf(stderr, "  -
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -t, --threads         number of threads for the CPU backend (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -H HOST, --host HOST  host to bind to (default: %s)\n", params.host.c_str());
+    fprintf(stderr, "  -p PORT, --port PORT  port to bind to (default: %d)\n", params.port);
+    fprintf(stderr, "  -m MEM, --mem MEM     backend memory size (in MB)\n");
+    fprintf(stderr, "  -c, --cache           enable local file cache\n");
     fprintf(stderr, "\n");
 }
```
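Two of the helpers above are copied verbatim from common.cpp so that rpc-server stays decoupled from libcommon; the `std::wstring_convert`/`<codecvt>` machinery they carry is also why the new `_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING` guard appears at the top of the file (MSVC otherwise rejects those deprecated headers). For comparison only, and not what the patch does: since the file now includes `<filesystem>` anyway, the same "create directory with parents" behavior can be sketched in C++17 as:

```cpp
// Comparison sketch only: a C++17 std::filesystem equivalent of the
// hand-rolled fs_create_directory_with_parents() copied from common.cpp.
#include <filesystem>
#include <string>
#include <system_error>

static bool create_directory_with_parents_sketch(const std::string & path) {
    std::error_code ec;
    std::filesystem::create_directories(path, ec); // creates all missing parents
    // succeed only if the path now exists and is a directory
    return !ec && std::filesystem::is_directory(path, ec);
}
```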
package/src/llama.cpp/examples/rpc/rpc-server.cpp (continued)

```diff
@@ -50,6 +177,15 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
                 return false;
             }
             params.host = argv[i];
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.n_threads = std::stoi(argv[i]);
+            if (params.n_threads <= 0) {
+                fprintf(stderr, "error: invalid number of threads: %d\n", params.n_threads);
+                return false;
+            }
         } else if (arg == "-p" || arg == "--port") {
             if (++i >= argc) {
                 return false;
@@ -58,6 +194,8 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
             if (params.port <= 0 || params.port > 65535) {
                 return false;
             }
+        } else if (arg == "-c" || arg == "--cache") {
+            params.use_cache = true;
         } else if (arg == "-m" || arg == "--mem") {
             if (++i >= argc) {
                 return false;
@@ -75,7 +213,7 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
     return true;
 }
 
-static ggml_backend_t create_backend() {
+static ggml_backend_t create_backend(const rpc_server_params & params) {
     ggml_backend_t backend = NULL;
 #ifdef GGML_USE_CUDA
     fprintf(stderr, "%s: using CUDA backend\n", __func__);
@@ -107,6 +245,7 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: using CPU backend\n", __func__);
         backend = ggml_backend_cpu_init();
+        ggml_backend_cpu_set_n_threads(backend, params.n_threads);
     }
     return backend;
 }
@@ -151,7 +290,7 @@ int main(int argc, char * argv[]) {
         fprintf(stderr, "\n");
     }
 
-    ggml_backend_t backend = create_backend();
+    ggml_backend_t backend = create_backend(params);
     if (!backend) {
         fprintf(stderr, "Failed to create backend\n");
         return 1;
@@ -164,8 +303,24 @@ int main(int argc, char * argv[]) {
     } else {
         get_backend_memory(&free_mem, &total_mem);
     }
-
-
+    const char * cache_dir = nullptr;
+    std::string cache_dir_str;
+    if (params.use_cache) {
+        cache_dir_str = fs_get_cache_directory() + "rpc/";
+        if (!fs_create_directory_with_parents(cache_dir_str)) {
+            fprintf(stderr, "Failed to create cache directory: %s\n", cache_dir_str.c_str());
+            return 1;
+        }
+        cache_dir = cache_dir_str.c_str();
+    }
+    printf("Starting RPC server v%d.%d.%d\n",
+        RPC_PROTO_MAJOR_VERSION,
+        RPC_PROTO_MINOR_VERSION,
+        RPC_PROTO_PATCH_VERSION);
+    printf("  endpoint       : %s\n", endpoint.c_str());
+    printf("  local cache    : %s\n", cache_dir ? cache_dir : "n/a");
+    printf("  backend memory : %zu MB\n", free_mem / (1024 * 1024));
+    ggml_backend_rpc_start_server(backend, endpoint.c_str(), cache_dir, free_mem, total_mem);
     ggml_backend_free(backend);
     return 0;
 }
```
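Taken together, these hunks add a `-t/--threads` option (CPU thread count, defaulting to half the hardware threads but at least one, applied via `ggml_backend_cpu_set_n_threads` when the CPU backend is selected), a `-c/--cache` option that stores files under an `rpc/` subdirectory of the llama.cpp cache, and a startup banner reporting the RPC protocol version, endpoint, cache path, and advertised memory; a cached, multi-threaded CPU server would be started with something like `rpc-server -H 0.0.0.0 -p 50052 -t 8 -c`. One caveat in the new parsing: `std::stoi` reports bad input by throwing, so a non-numeric value such as `--threads abc` terminates the process before the `n_threads <= 0` check or the usage message is ever reached. A stricter parser, sketched here purely as an illustration (not part of the patch), could look like:

```cpp
// Illustration only (not in the patch): parse a positive integer without
// letting std::stoi's exceptions (invalid_argument/out_of_range) escape.
#include <stdexcept>
#include <string>

static bool parse_positive_int(const std::string & s, int & out) {
    try {
        size_t end = 0;
        const int v = std::stoi(s, &end);
        if (end != s.size() || v <= 0) {
            return false; // trailing junk, or not a positive count
        }
        out = v;
        return true;
    } catch (const std::exception &) {
        return false; // not a number at all, or out of int range
    }
}
```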
package/src/llama.cpp/examples/run/CMakeLists.txt

```diff
@@ -1,5 +1,16 @@
 set(TARGET llama-run)
 add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
+
+# TODO: avoid copying this code block from common/CMakeLists.txt
+set(LLAMA_RUN_EXTRA_LIBS "")
+if (LLAMA_CURL)
+    find_package(CURL REQUIRED)
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
+    include_directories(${CURL_INCLUDE_DIRS})
+    find_library(CURL_LIBRARY curl REQUIRED)
+    set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARY})
+endif ()
+
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
```
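The effect of this change: when llama.cpp is configured with `-DLLAMA_CURL=ON`, the `llama-run` target itself now gets the `LLAMA_USE_CURL` define and a direct link against libcurl, rather than depending on how the common library happened to be built; the TODO comment acknowledges that the block is copied from common/CMakeLists.txt instead of being factored out.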
package/src/llama.cpp/examples/run/run.cpp

```diff
@@ -38,24 +38,6 @@
 }
 #endif
 
-GGML_ATTRIBUTE_FORMAT(1, 2)
-static std::string fmt(const char * fmt, ...) {
-    va_list ap;
-    va_list ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    const int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
-    std::string buf;
-    buf.resize(size);
-    const int size2 = vsnprintf(const_cast<char *>(buf.data()), buf.size() + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-
-    return buf;
-}
-
 GGML_ATTRIBUTE_FORMAT(1, 2)
 static int printe(const char * fmt, ...) {
     va_list args;
```
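The local `fmt()` helper removed here duplicated a printf-style formatter that the common library (which `llama-run` already links) exposes as `string_format()`; the hunks below switch the remaining call sites over to it, which is where the truncated `return` lines in this diff view come from. As context, a minimal self-contained sketch of such a vsnprintf-based formatter, mirroring the removed helper rather than quoting the library's exact implementation:

```cpp
// Sketch of a printf-style std::string formatter, in the spirit of the
// removed fmt() helper; llama.cpp's common library provides string_format().
#include <cstdarg>
#include <cstdio>
#include <string>

static std::string string_format_sketch(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    const int size = vsnprintf(nullptr, 0, fmt, ap); // measure first
    std::string buf(size > 0 ? size : 0, '\0');
    vsnprintf(&buf[0], buf.size() + 1, fmt, ap2);    // then format in place
    va_end(ap2);
    va_end(ap);
    return buf;
}
```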
package/src/llama.cpp/examples/run/run.cpp (continued)

```diff
@@ -525,11 +507,11 @@ class HttpClient {
         int secs = static_cast<int>(seconds) % 60;
 
         if (hrs > 0) {
-            return
+            return string_format("%dh %02dm %02ds", hrs, mins, secs);
         } else if (mins > 0) {
-            return
+            return string_format("%dm %02ds", mins, secs);
         } else {
-            return
+            return string_format("%ds", secs);
         }
     }
 
@@ -544,7 +526,7 @@ class HttpClient {
             }
         }
 
-        return
+        return string_format("%.2f %s", dbl_size, suffix[i]);
     }
 
     static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
@@ -578,7 +560,9 @@ class HttpClient {
         return (now_downloaded_plus_file_size * 100) / total_to_download;
     }
 
-    static std::string generate_progress_prefix(curl_off_t percentage) {
+    static std::string generate_progress_prefix(curl_off_t percentage) {
+        return string_format("%3ld%% |", static_cast<long int>(percentage));
+    }
 
     static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
         const auto now = std::chrono::steady_clock::now();
@@ -589,9 +573,9 @@ class HttpClient {
     static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download,
                                                 double speed, double estimated_time) {
         const int width = 10;
-        return
-
-
+        return string_format("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(),
+                             width, human_readable_size(total_to_download).c_str(), width,
+                             human_readable_size(speed).c_str(), width, human_readable_time(estimated_time).c_str());
     }
 
     static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {
@@ -713,8 +697,10 @@ class LlamaData {
         std::vector<std::string> headers = { "User-Agent: llama-cpp", "Accept: application/json" };
         std::string url;
 
+        std::string model_endpoint = get_model_endpoint();
+
         if (pos == std::string::npos) {
-            auto [model_name, manifest_url] = extract_model_and_tag(model, "
+            auto [model_name, manifest_url] = extract_model_and_tag(model, model_endpoint + "v2/");
             hfr = model_name;
 
             nlohmann::json manifest;
@@ -729,7 +715,7 @@ class LlamaData {
             hff = model.substr(pos + 1);
         }
 
-        url =
+        url = model_endpoint + hfr + "/resolve/main/" + hff;
 
         return download(url, bn, true, headers);
     }
```