@fugood/llama.node 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/CMakeLists.txt +72 -46
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +732 -752
- package/src/llama.cpp/common/common.h +47 -41
- package/src/llama.cpp/common/grammar-parser.cpp +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +89 -7
- package/src/llama.cpp/common/sampling.h +5 -0
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +24 -16
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
- package/src/llama.cpp/examples/server/server.cpp +21 -9
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +0 -1
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +4 -0
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3700 -2041
- package/src/llama.cpp/ggml-rpc.cpp +188 -56
- package/src/llama.cpp/ggml-sycl.cpp +99 -530
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +202 -225
- package/src/llama.cpp/ggml.c +1034 -1154
- package/src/llama.cpp/ggml.h +59 -31
- package/src/llama.cpp/llama.cpp +859 -609
- package/src/llama.cpp/llama.h +19 -6
- package/src/llama.cpp/requirements.txt +0 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
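
The package/package.json change (+1 -1) is presumably just the version bump named in the title; that hunk itself was not captured, so the following rendering is hypothetical:

--- package/package.json
+++ package/package.json
-  "version": "0.2.1",
+  "version": "0.2.2",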

package/src/llama.cpp/examples/llava/llava.cpp
@@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
 static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
     struct {
-        struct ggml_tensor * newline;
         struct ggml_context * ctx;
     } model;
 
@@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     model.ctx = ggml_init(params);
 
-    ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
-    model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
-    if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
-        if (newline_tmp->buffer == NULL) {
-            LOG_TEE("newline_tmp tensor buffer is NULL\n");
-        }
-        ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
-    } else {
-        model.newline->data = newline_tmp->data;
-        if (model.newline->data == NULL) {
-            LOG_TEE("newline_tmp tensor data is NULL\n");
-        }
-    }
-
     struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base

package/src/llama.cpp/examples/lookahead/lookahead.cpp
@@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
         // debug
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-
+            llama_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
         // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/

package/src/llama.cpp/examples/lookup/lookup.cpp
@@ -121,7 +121,7 @@ int main(int argc, char ** argv){
         // debug
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-
+            llama_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
         // print current draft sequence

package/src/llama.cpp/examples/main/main.cpp
@@ -60,9 +60,9 @@ static void write_logfile(
         return;
     }
 
-    const std::string timestamp =
+    const std::string timestamp = string_get_sortable_timestamp();
 
-    const bool success =
+    const bool success = fs_create_directory_with_parents(params.logdir);
     if (!success) {
         fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
                 __func__, params.logdir.c_str());
@@ -80,7 +80,7 @@ static void write_logfile(
     fprintf(logfile, "binary: main\n");
     char model_desc[128];
     llama_model_desc(model, model_desc, sizeof(model_desc));
-
+    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
 
     fprintf(logfile, "\n");
     fprintf(logfile, "######################\n");
@@ -88,8 +88,8 @@ static void write_logfile(
     fprintf(logfile, "######################\n");
     fprintf(logfile, "\n");
 
-
-
+    yaml_dump_string_multiline(logfile, "output", output.c_str());
+    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
     llama_dump_timing_info_yaml(logfile, ctx);
     fclose(logfile);
@@ -181,7 +181,7 @@ int main(int argc, char ** argv) {
 
     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
-        params.prompt =
+        params.prompt = string_random_prompt(rng);
     }
 
     LOG("%s: llama backend init\n", __func__);
@@ -219,7 +219,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_TEE("\n");
-        LOG_TEE("%s\n",
+        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
    }
 
    std::string path_session = params.path_prompt_cache;
@@ -474,12 +474,12 @@ int main(int argc, char ** argv) {
    LOG_TEE("\n\n");
 
    if (params.interactive) {
-        const char *control_message;
+        const char * control_message;
        if (params.multiline_input) {
-            control_message = " - To return control to
+            control_message = " - To return control to the AI, end your input with '\\'.\n"
                              " - To return control without starting a new line, end your input with '/'.\n";
        } else {
-            control_message = " - Press Return to return control to
+            control_message = " - Press Return to return control to the AI.\n"
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }
@@ -707,7 +707,7 @@ int main(int argc, char ** argv) {
 
            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
 
-            llama_sampling_accept(ctx_sampling, ctx, id, true);
+            llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
 
            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
 
@@ -728,7 +728,7 @@ int main(int argc, char ** argv) {
 
                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
-                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
 
                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
@@ -740,18 +740,26 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo && display) {
            for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id,
-                printf("%s", token_str.c_str());
+                const std::string token_str = llama_token_to_piece(ctx, id, params.special);
 
+                // Console/Stream Output
+                fprintf(stdout, "%s", token_str.c_str());
+
+                // Record Displayed Tokens To Log
+                // Note: Generated tokens are created one by one hence this check
                if (embd.size() > 1) {
+                    // Incoming Requested Tokens
                    input_tokens.push_back(id);
                } else {
+                    // Outgoing Generated Tokens
                    output_tokens.push_back(id);
                    output_ss << token_str;
                }
+
+                fflush(stdout);
            }
-            fflush(stdout);
        }
+
        // reset color to default if there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
            console::set_display(console::reset);
@@ -879,7 +887,7 @@ int main(int argc, char ** argv) {
                    embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
                }
                if (params.escape) {
-
+                    string_process_escapes(buffer);
                }
 
                const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
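
The /* apply_grammar= */ annotations above label the final parameter of llama_sampling_accept from common/sampling.h, which main.cpp already calls in the hunks shown. A minimal sketch of the intended pattern, assuming those common-library helpers; the wrapper function and variable names below are illustrative only, not part of this diff:

    #include "common.h"
    #include "sampling.h"

    #include <vector>

    // Prompt tokens only feed the repetition-penalty history (no grammar),
    // while the freshly sampled token also advances the grammar state.
    static llama_token sample_next(llama_sampling_context * ctx_sampling,
                                   llama_context * ctx,
                                   const std::vector<llama_token> & prompt_tokens) {
        for (const llama_token tok : prompt_tokens) {
            llama_sampling_accept(ctx_sampling, ctx, tok, /* apply_grammar= */ false);
        }
        const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
        llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
        return id;
    }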

package/src/llama.cpp/examples/parallel/parallel.cpp
@@ -210,7 +210,7 @@ int main(int argc, char ** argv) {
    while (true) {
        if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
-
+            llama_kv_cache_dump_view_seqs(kvc_view, 40);
        }
 
        llama_batch_clear(batch);

package/src/llama.cpp/examples/perplexity/perplexity.cpp
@@ -44,9 +44,9 @@ static void write_logfile(
        return;
    }
 
-    const std::string timestamp =
+    const std::string timestamp = string_get_sortable_timestamp();
 
-    const bool success =
+    const bool success = fs_create_directory_with_parents(params.logdir);
    if (!success) {
        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
@@ -64,7 +64,7 @@ static void write_logfile(
    fprintf(logfile, "binary: main\n");
    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
-
+    yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
 
    fprintf(logfile, "\n");
    fprintf(logfile, "######################\n");
@@ -72,9 +72,9 @@ static void write_logfile(
    fprintf(logfile, "######################\n");
    fprintf(logfile, "\n");
 
-
+    yaml_dump_vector_float(logfile, "logits", results.logits);
    fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
-
+    yaml_dump_vector_float(logfile, "probs", results.probs);
 
    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
@@ -1425,7 +1425,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    // Use all tasks
    tasks.resize(n_task);
    printf("%s: reading tasks", __func__);
-    int n_dot = n_task/100;
+    int n_dot = std::max((int) n_task/100, 1);
    int i = 0;
    for (auto& task : tasks) {
        ++i;
@@ -1675,7 +1675,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
    llama_batch_free(batch);
 
-    if (n_done < 100) return;
+    if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;
 
    float p = 1.f*n_correct/n_done;
    float sigma = sqrt(p*(1-p)/(n_done-1));
@@ -2007,7 +2007,7 @@ int main(int argc, char ** argv) {
 
    std::mt19937 rng(params.seed);
    if (params.random_prompt) {
-        params.prompt =
+        params.prompt = string_random_prompt(rng);
    }
 
    llama_backend_init();
@@ -2035,7 +2035,7 @@ int main(int argc, char ** argv) {
    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n",
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }
 
    struct results_perplexity results;
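
The std::max guard on n_dot above matters because n_dot is later used as a modulus when printing progress dots, and with fewer than 100 tasks the old integer division produced zero. A small self-contained illustration; the i % n_dot progress check is assumed from the surrounding loop, which is not part of the captured hunk:

    #include <algorithm>
    #include <cstdio>

    int main() {
        int n_task = 42;                               // fewer than 100 tasks
        int n_dot  = std::max((int) n_task / 100, 1);  // previously n_task / 100, i.e. 0 here
        for (int i = 1; i <= n_task; ++i) {
            if (i % n_dot == 0) {                      // with n_dot == 0 this would be undefined behaviour
                printf(".");
            }
        }
        printf("\n");
        return 0;
    }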

package/src/llama.cpp/examples/quantize/quantize.cpp
@@ -259,7 +259,7 @@ int main(int argc, char ** argv) {
            usage(argv[0]);
        }
    } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
-        if (arg_idx == argc-1 || !
+        if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
            usage(argv[0]);
        }
    } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
@@ -284,7 +284,7 @@ int main(int argc, char ** argv) {
        } else {
            usage(argv[0]);
        }
-    } else if (strcmp(argv[arg_idx], "--keep-split")) {
+    } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
        params.keep_split = true;
    } else {
        usage(argv[0]);

package/src/llama.cpp/examples/retrieval/retrieval.cpp
@@ -11,7 +11,7 @@ struct retrieval_params {
 };
 
 static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
-
+    gpt_params_print_usage(argc, argv, gpt_params);
    printf("retrieval options:\n");
    printf(" --context-file FNAME file containing context to embed.\n");
    printf(" specify multiple files by providing --context-file option multiple times.\n");
@@ -226,7 +226,7 @@ int main(int argc, char ** argv) {
    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n",
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }
 
    // max batch size

package/src/llama.cpp/examples/rpc/rpc-server.cpp
@@ -7,9 +7,64 @@
 #endif
 
 #include "ggml-rpc.h"
+#ifdef _WIN32
+#  include <windows.h>
+#else
+#  include <unistd.h>
+#endif
 #include <string>
 #include <stdio.h>
 
+struct rpc_server_params {
+    std::string host = "0.0.0.0";
+    int port = 50052;
+    size_t backend_mem = 0;
+};
+
+static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
+    fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, " -h, --help show this help message and exit\n");
+    fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
+    fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
+    fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
+    fprintf(stderr, "\n");
+}
+
+static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg == "-H" || arg == "--host") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.host = argv[i];
+        } else if (arg == "-p" || arg == "--port") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.port = std::stoi(argv[i]);
+            if (params.port <= 0 || params.port > 65535) {
+                return false;
+            }
+        } else if (arg == "-m" || arg == "--mem") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
+        } else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+    return true;
+}
+
 static ggml_backend_t create_backend() {
    ggml_backend_t backend = NULL;
 #ifdef GGML_USE_CUDA
@@ -38,21 +93,25 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
 #ifdef GGML_USE_CUDA
    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
 #else
-
-
-
+#ifdef _WIN32
+    MEMORYSTATUSEX status;
+    status.dwLength = sizeof(status);
+    GlobalMemoryStatusEx(&status);
+    *total_mem = status.ullTotalPhys;
+    *free_mem = status.ullAvailPhys;
+#else
+    long pages = sysconf(_SC_PHYS_PAGES);
+    long page_size = sysconf(_SC_PAGE_SIZE);
+    *total_mem = pages * page_size;
+    *free_mem = *total_mem;
+#endif
 #endif
 }
 
 int main(int argc, char * argv[]) {
-
-
-
-    }
-    const char * host = argv[1];
-    int port = std::stoi(argv[2]);
-    if (port <= 0 || port > 65535) {
-        fprintf(stderr, "Invalid port number: %d\n", port);
+    rpc_server_params params;
+    if (!rpc_server_params_parse(argc, argv, params)) {
+        fprintf(stderr, "Invalid parameters\n");
        return 1;
    }
    ggml_backend_t backend = create_backend();
@@ -60,10 +119,15 @@ int main(int argc, char * argv[]) {
        fprintf(stderr, "Failed to create backend\n");
        return 1;
    }
-
+    std::string endpoint = params.host + ":" + std::to_string(params.port);
    size_t free_mem, total_mem;
-
-
+    if (params.backend_mem > 0) {
+        free_mem = params.backend_mem;
+        total_mem = params.backend_mem;
+    } else {
+        get_backend_memory(&free_mem, &total_mem);
+    }
+    printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
    ggml_backend_free(backend);
    return 0;

package/src/llama.cpp/examples/server/server.cpp
@@ -102,7 +102,6 @@ struct slot_params {
    bool stream = true;
    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
 
-    uint32_t seed = -1; // RNG seed
    int32_t n_keep = 0; // number of tokens to keep from initial prompt
    int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
    int32_t n_predict = -1; // new tokens to predict
@@ -671,6 +670,13 @@ struct server_context {
            model = nullptr;
        }
 
+        // Clear any sampling context
+        for (server_slot & slot : slots) {
+            if (slot.ctx_sampling != nullptr) {
+                llama_sampling_free(slot.ctx_sampling);
+            }
+        }
+
        llama_batch_free(batch);
    }
 
@@ -1013,7 +1019,7 @@ struct server_context {
                    sampler_names.emplace_back(sampler_name);
                }
            }
-            slot.sparams.samplers_sequence =
+            slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
        } else {
            slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
        }
@@ -1250,14 +1256,14 @@ struct server_context {
        std::vector<std::string> samplers_sequence;
        samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
        for (const auto & sampler_type : slot.sparams.samplers_sequence) {
-            samplers_sequence.emplace_back(
+            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
        }
 
        return json {
            {"n_ctx", slot.n_ctx},
            {"n_predict", slot.n_predict},
            {"model", params.model_alias},
-            {"seed", slot.
+            {"seed", slot.sparams.seed},
            {"temperature", slot.sparams.temp},
            {"dynatemp_range", slot.sparams.dynatemp_range},
            {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
@@ -1975,8 +1981,7 @@ struct server_context {
                    slot.state = SLOT_STATE_PROCESSING;
                    slot.command = SLOT_COMMAND_NONE;
                    slot.release();
-                    slot.
-                    send_final_response(slot);
+                    send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
                    continue;
                }
            } else {
@@ -2380,6 +2385,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
    printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
    printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
    printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
+    printf(" --rpc SERVERS comma separated list of RPC servers\n");
    printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
    printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
    printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
@@ -2432,6 +2438,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                break;
            }
            sparams.port = std::stoi(argv[i]);
+        } else if (arg == "--rpc") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rpc_servers = argv[i];
        } else if (arg == "--host") {
            if (++i >= argc) {
                invalid_param = true;
@@ -2840,7 +2852,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                invalid_param = true;
                break;
            }
-            if (!
+            if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
                invalid_param = true;
                break;
@@ -3298,7 +3310,7 @@ int main(int argc, char ** argv) {
    const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
        std::string filename = request_data.at("filename");
-        if (!
+        if (!fs_validate_filename(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
        }
@@ -3328,7 +3340,7 @@ int main(int argc, char ** argv) {
    const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
        std::string filename = request_data.at("filename");
-        if (!
+        if (!fs_validate_filename(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
        }