@fugood/llama.node 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +9 -0
- package/README.md +1 -1
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +2 -1
- package/patches/llama.patch +22 -0
- package/src/LlamaContext.cpp +2 -2
- package/src/TokenizeWorker.cpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +82 -54
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +748 -754
- package/src/llama.cpp/common/common.h +49 -41
- package/src/llama.cpp/common/grammar-parser.cpp +10 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +92 -10
- package/src/llama.cpp/common/sampling.h +6 -1
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/CMakeLists.txt +3 -0
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +29 -17
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
- package/src/llama.cpp/examples/server/server.cpp +33 -25
- package/src/llama.cpp/examples/server/utils.hpp +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +2 -3
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +13 -3
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3715 -2050
- package/src/llama.cpp/ggml-rpc.cpp +1155 -0
- package/src/llama.cpp/ggml-rpc.h +24 -0
- package/src/llama.cpp/ggml-sycl.cpp +119 -673
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +203 -224
- package/src/llama.cpp/ggml.c +1208 -1483
- package/src/llama.cpp/ggml.h +71 -46
- package/src/llama.cpp/llama.cpp +1374 -938
- package/src/llama.cpp/llama.h +22 -6
- package/src/llama.cpp/requirements.txt +0 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2

package/src/llama.cpp/examples/perplexity/perplexity.cpp

@@ -44,9 +44,9 @@ static void write_logfile(
         return;
     }

-    const std::string timestamp =
+    const std::string timestamp = string_get_sortable_timestamp();

-    const bool success =
+    const bool success = fs_create_directory_with_parents(params.logdir);
     if (!success) {
         fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
             __func__, params.logdir.c_str());

@@ -64,7 +64,7 @@ static void write_logfile(
     fprintf(logfile, "binary: main\n");
     char model_desc[128];
     llama_model_desc(model, model_desc, sizeof(model_desc));
-
+    yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);

     fprintf(logfile, "\n");
     fprintf(logfile, "######################\n");

@@ -72,9 +72,9 @@ static void write_logfile(
     fprintf(logfile, "######################\n");
     fprintf(logfile, "\n");

-
+    yaml_dump_vector_float(logfile, "logits", results.logits);
     fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
-
+    yaml_dump_vector_float(logfile, "probs", results.probs);

     llama_dump_timing_info_yaml(logfile, ctx);
     fclose(logfile);

@@ -1425,7 +1425,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
     // Use all tasks
     tasks.resize(n_task);
     printf("%s: reading tasks", __func__);
-    int n_dot = n_task/100;
+    int n_dot = std::max((int) n_task/100, 1);
     int i = 0;
     for (auto& task : tasks) {
         ++i;

@@ -1675,7 +1675,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

     llama_batch_free(batch);

-    if (n_done < 100) return;
+    if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;

     float p = 1.f*n_correct/n_done;
     float sigma = sqrt(p*(1-p)/(n_done-1));

@@ -2007,7 +2007,7 @@ int main(int argc, char ** argv) {

     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
-        params.prompt =
+        params.prompt = string_random_prompt(rng);
     }

     llama_backend_init();

@@ -2035,7 +2035,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n",
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }

     struct results_perplexity results;
package/src/llama.cpp/examples/quantize/quantize.cpp

@@ -259,7 +259,7 @@ int main(int argc, char ** argv) {
             usage(argv[0]);
         }
     } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
-        if (arg_idx == argc-1 || !
+        if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
             usage(argv[0]);
         }
     } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {

@@ -284,7 +284,7 @@ int main(int argc, char ** argv) {
         } else {
             usage(argv[0]);
         }
-    } else if (strcmp(argv[arg_idx], "--keep-split")) {
+    } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
         params.keep_split = true;
     } else {
         usage(argv[0]);
package/src/llama.cpp/examples/retrieval/retrieval.cpp

@@ -11,7 +11,7 @@ struct retrieval_params {
 };

 static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
-
+    gpt_params_print_usage(argc, argv, gpt_params);
     printf("retrieval options:\n");
     printf(" --context-file FNAME file containing context to embed.\n");
     printf("                      specify multiple files by providing --context-file option multiple times.\n");

@@ -226,7 +226,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n",
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }

     // max batch size
package/src/llama.cpp/examples/rpc/rpc-server.cpp (new file)

@@ -0,0 +1,134 @@
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include "ggml-rpc.h"
+#ifdef _WIN32
+#  include <windows.h>
+#else
+#  include <unistd.h>
+#endif
+#include <string>
+#include <stdio.h>
+
+struct rpc_server_params {
+    std::string host = "0.0.0.0";
+    int port = 50052;
+    size_t backend_mem = 0;
+};
+
+static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
+    fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, " -h, --help            show this help message and exit\n");
+    fprintf(stderr, " -H HOST, --host HOST  host to bind to (default: %s)\n", params.host.c_str());
+    fprintf(stderr, " -p PORT, --port PORT  port to bind to (default: %d)\n", params.port);
+    fprintf(stderr, " -m MEM, --mem MEM     backend memory size (in MB)\n");
+    fprintf(stderr, "\n");
+}
+
+static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg == "-H" || arg == "--host") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.host = argv[i];
+        } else if (arg == "-p" || arg == "--port") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.port = std::stoi(argv[i]);
+            if (params.port <= 0 || params.port > 65535) {
+                return false;
+            }
+        } else if (arg == "-m" || arg == "--mem") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
+        } else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+    return true;
+}
+
+static ggml_backend_t create_backend() {
+    ggml_backend_t backend = NULL;
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    backend = ggml_backend_cuda_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#elif GGML_USE_METAL
+    fprintf(stderr, "%s: using Metal backend\n", __func__);
+    backend = ggml_backend_metal_init();
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    }
+#endif
+
+    // if there aren't GPU Backends fallback to CPU backend
+    if (!backend) {
+        fprintf(stderr, "%s: using CPU backend\n", __func__);
+        backend = ggml_backend_cpu_init();
+    }
+    return backend;
+}
+
+static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
+#ifdef GGML_USE_CUDA
+    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+#else
+    #ifdef _WIN32
+        MEMORYSTATUSEX status;
+        status.dwLength = sizeof(status);
+        GlobalMemoryStatusEx(&status);
+        *total_mem = status.ullTotalPhys;
+        *free_mem = status.ullAvailPhys;
+    #else
+        long pages = sysconf(_SC_PHYS_PAGES);
+        long page_size = sysconf(_SC_PAGE_SIZE);
+        *total_mem = pages * page_size;
+        *free_mem = *total_mem;
+    #endif
+#endif
+}
+
+int main(int argc, char * argv[]) {
+    rpc_server_params params;
+    if (!rpc_server_params_parse(argc, argv, params)) {
+        fprintf(stderr, "Invalid parameters\n");
+        return 1;
+    }
+    ggml_backend_t backend = create_backend();
+    if (!backend) {
+        fprintf(stderr, "Failed to create backend\n");
+        return 1;
+    }
+    std::string endpoint = params.host + ":" + std::to_string(params.port);
+    size_t free_mem, total_mem;
+    if (params.backend_mem > 0) {
+        free_mem = params.backend_mem;
+        total_mem = params.backend_mem;
+    } else {
+        get_backend_memory(&free_mem, &total_mem);
+    }
+    printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
+    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
+    ggml_backend_free(backend);
+    return 0;
+}
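
The rpc-server example above only covers the server side. The newly added ggml-rpc.h (+24 lines in this diff) also declares client-side entry points for attaching a remote backend. Below is a minimal, hypothetical client sketch, not code from this package; it assumes ggml_backend_rpc_init() and ggml_backend_rpc_get_device_memory() have the signatures used here.

    // Hypothetical client sketch (not part of this package): attach to a running
    // rpc-server and query the memory it advertises. Assumes the declarations in
    // the newly added ggml-rpc.h: ggml_backend_rpc_init(endpoint) and
    // ggml_backend_rpc_get_device_memory(endpoint, &free, &total).
    #include "ggml-rpc.h"

    #include <cstdio>

    int main() {
        const char * endpoint = "127.0.0.1:50052"; // address of a running rpc-server (example value)

        // connect to the remote backend exposed by rpc-server
        ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
        if (backend == NULL) {
            fprintf(stderr, "failed to connect to %s\n", endpoint);
            return 1;
        }

        // query how much memory the remote backend reports
        size_t free_mem  = 0;
        size_t total_mem = 0;
        ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);
        printf("remote backend: %zu MB free / %zu MB total\n",
               free_mem / (1024 * 1024), total_mem / (1024 * 1024));

        ggml_backend_free(backend); // release the backend handle
        return 0;
    }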
package/src/llama.cpp/examples/server/server.cpp

@@ -102,7 +102,6 @@ struct slot_params {
     bool stream = true;
     bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt

-    uint32_t seed = -1; // RNG seed
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
     int32_t n_predict = -1; // new tokens to predict

@@ -651,9 +650,6 @@ struct server_context {
     std::string system_prompt;
     std::vector<llama_token> system_tokens;

-    std::string name_user; // this should be the antiprompt
-    std::string name_assistant;
-
     // slots / clients
     std::vector<server_slot> slots;
     json default_generation_settings_for_props;

@@ -673,6 +669,15 @@ struct server_context {
         llama_free_model(model);
         model = nullptr;
     }
+
+    // Clear any sampling context
+    for (server_slot & slot : slots) {
+        if (slot.ctx_sampling != nullptr) {
+            llama_sampling_free(slot.ctx_sampling);
+        }
+    }
+
+    llama_batch_free(batch);
 }

 bool load_model(const gpt_params & params_) {

@@ -1014,7 +1019,7 @@ struct server_context {
             sampler_names.emplace_back(sampler_name);
         }
     }
-    slot.sparams.samplers_sequence =
+    slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
 } else {
     slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
 }

@@ -1098,15 +1103,11 @@ struct server_context {
         system_need_update = false;
     }

-
-    system_prompt
-    name_user = sys_props.value("anti_prompt", "");
-    name_assistant = sys_props.value("assistant_name", "");
+    bool system_prompt_set(const std::string & sys_prompt) {
+        system_prompt = sys_prompt;

         LOG_VERBOSE("system prompt process", {
             {"system_prompt", system_prompt},
-            {"name_user", name_user},
-            {"name_assistant", name_assistant},
         });

         // release all slots

@@ -1115,6 +1116,7 @@ struct server_context {
         }

         system_need_update = true;
+        return true;
     }

     bool process_token(completion_token_output & result, server_slot & slot) {

@@ -1254,14 +1256,14 @@ struct server_context {
     std::vector<std::string> samplers_sequence;
     samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
     for (const auto & sampler_type : slot.sparams.samplers_sequence) {
-        samplers_sequence.emplace_back(
+        samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
     }

     return json {
         {"n_ctx", slot.n_ctx},
         {"n_predict", slot.n_predict},
         {"model", params.model_alias},
-        {"seed", slot.
+        {"seed", slot.sparams.seed},
         {"temperature", slot.sparams.temp},
         {"dynatemp_range", slot.sparams.dynatemp_range},
         {"dynatemp_exponent", slot.sparams.dynatemp_exponent},

@@ -1534,7 +1536,8 @@ struct server_context {
     }

     if (task.data.contains("system_prompt")) {
-
+        std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
+        system_prompt_set(sys_prompt);

         for (server_slot & slot : slots) {
             slot.n_past = 0;

@@ -1978,8 +1981,7 @@ struct server_context {
     slot.state = SLOT_STATE_PROCESSING;
     slot.command = SLOT_COMMAND_NONE;
     slot.release();
-    slot.
-    send_final_response(slot);
+    send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
     continue;
 }
 } else {

@@ -2270,10 +2272,10 @@ struct server_context {

     const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
     if (n_probs > 0) {
-        const size_t
+        const size_t n_valid = slot.ctx_sampling->n_valid;

         // Make sure at least n_probs top tokens are at the front of the vector:
-        if (slot.sparams.temp == 0.0f && n_probs >
+        if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
            llama_sample_top_k(ctx, &cur_p, n_probs, 0);
        }


@@ -2289,7 +2291,7 @@ struct server_context {
        for (size_t i = 0; i < n_probs; ++i) {
            result.probs.push_back({
                cur_p.data[i].id,
-                i >=
+                i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
            });
        }
    }

@@ -2383,6 +2385,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
     printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
     printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
+    printf(" --rpc SERVERS comma separated list of RPC servers\n");
     printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
     printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
     printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");

@@ -2435,6 +2438,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
         break;
     }
     sparams.port = std::stoi(argv[i]);
+} else if (arg == "--rpc") {
+    if (++i >= argc) {
+        invalid_param = true;
+        break;
+    }
+    params.rpc_servers = argv[i];
 } else if (arg == "--host") {
     if (++i >= argc) {
         invalid_param = true;

@@ -2843,7 +2852,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
         invalid_param = true;
         break;
     }
-    if (!
+    if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
         fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
         invalid_param = true;
         break;

@@ -2918,7 +2927,7 @@ int main(int argc, char ** argv) {
     server_params_parse(argc, argv, sparams, params);

     if (!sparams.system_prompt.empty()) {
-        ctx_server.system_prompt_set(
+        ctx_server.system_prompt_set(sparams.system_prompt);
     }

     if (params.model_alias == "unknown") {

@@ -3301,7 +3310,7 @@ int main(int argc, char ** argv) {
     const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
        std::string filename = request_data.at("filename");
-        if (!
+        if (!fs_validate_filename(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
        }

@@ -3331,7 +3340,7 @@ int main(int argc, char ** argv) {
     const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
        std::string filename = request_data.at("filename");
-        if (!
+        if (!fs_validate_filename(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
        }

@@ -3407,8 +3416,7 @@ int main(int argc, char ** argv) {
     const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
        json data = {
-            { "
-            { "assistant_name", ctx_server.name_assistant.c_str() },
+            { "system_prompt", ctx_server.system_prompt.c_str() },
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots", ctx_server.params.n_parallel }
        };
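
The --rpc flag added to server.cpp above stores a comma-separated endpoint list in params.rpc_servers. A minimal sketch of how such a list plausibly reaches the model loader is shown below; it assumes llama_model_params gained an rpc_servers field in this llama.cpp revision (llama.h changes by +22 -6 in this diff), so treat the field name and wiring as an assumption, not a confirmed API, and the endpoints as example values.

    // Hypothetical sketch (not from this package): offload work to RPC workers by
    // passing the same comma-separated endpoint list to the model parameters.
    // The rpc_servers field is assumed to exist in this llama.cpp revision.
    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 99;                                       // offload as many layers as possible
        mparams.rpc_servers  = "192.168.1.10:50052,192.168.1.11:50052";  // example worker endpoints

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == NULL) {
            llama_backend_free();
            return 1;
        }

        // ... create a llama_context and run inference as usual ...

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }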
package/src/llama.cpp/examples/server/utils.hpp

@@ -371,7 +371,7 @@ static json oaicompat_completion_params_parse(
     llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
     llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
     llama_params["stream"] = json_value(body, "stream", false);
-    llama_params["temperature"] = json_value(body, "temperature",
+    llama_params["temperature"] = json_value(body, "temperature", 1.0);
     llama_params["top_p"] = json_value(body, "top_p", 1.0);

     // Apply chat template to the list of messages