@fugood/llama.node 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/CMakeLists.txt +9 -0
  2. package/README.md +1 -1
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +1 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/LlamaContext.cpp +2 -2
  23. package/src/TokenizeWorker.cpp +1 -1
  24. package/src/llama.cpp/CMakeLists.txt +82 -54
  25. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  26. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  27. package/src/llama.cpp/common/common.cpp +748 -754
  28. package/src/llama.cpp/common/common.h +49 -41
  29. package/src/llama.cpp/common/grammar-parser.cpp +10 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  31. package/src/llama.cpp/common/log.h +5 -5
  32. package/src/llama.cpp/common/sampling.cpp +92 -10
  33. package/src/llama.cpp/common/sampling.h +6 -1
  34. package/src/llama.cpp/common/train.cpp +2 -2
  35. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  36. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  37. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  38. package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
  39. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  40. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  42. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  43. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
  44. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
  45. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
  46. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  47. package/src/llama.cpp/examples/llava/clip.h +1 -1
  48. package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
  49. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  50. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  51. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  52. package/src/llama.cpp/examples/main/main.cpp +29 -17
  53. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  54. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  55. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  56. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  57. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  58. package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
  59. package/src/llama.cpp/examples/server/server.cpp +33 -25
  60. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  61. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  62. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  63. package/src/llama.cpp/ggml-backend.c +2 -3
  64. package/src/llama.cpp/ggml-common.h +0 -54
  65. package/src/llama.cpp/ggml-cuda.h +1 -0
  66. package/src/llama.cpp/ggml-impl.h +51 -0
  67. package/src/llama.cpp/ggml-kompute.cpp +13 -3
  68. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  69. package/src/llama.cpp/ggml-quants.c +3715 -2050
  70. package/src/llama.cpp/ggml-rpc.cpp +1155 -0
  71. package/src/llama.cpp/ggml-rpc.h +24 -0
  72. package/src/llama.cpp/ggml-sycl.cpp +119 -673
  73. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  74. package/src/llama.cpp/ggml-vulkan.cpp +203 -224
  75. package/src/llama.cpp/ggml.c +1208 -1483
  76. package/src/llama.cpp/ggml.h +71 -46
  77. package/src/llama.cpp/llama.cpp +1374 -938
  78. package/src/llama.cpp/llama.h +22 -6
  79. package/src/llama.cpp/requirements.txt +0 -2
  80. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
  82. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  83. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  84. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  85. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  86. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  87. package/src/llama.cpp/unicode-data.h +15 -12
  88. package/src/llama.cpp/unicode.cpp +89 -111
  89. package/src/llama.cpp/unicode.h +44 -12
  90. package/src/llama.cpp/build.zig +0 -172
  91. package/src/llama.cpp/ggml-mpi.c +0 -216
  92. package/src/llama.cpp/ggml-mpi.h +0 -39
  93. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
  94. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
package/src/llama.cpp/examples/perplexity/perplexity.cpp

@@ -44,9 +44,9 @@ static void write_logfile(
         return;
     }
 
-    const std::string timestamp = get_sortable_timestamp();
+    const std::string timestamp = string_get_sortable_timestamp();
 
-    const bool success = create_directory_with_parents(params.logdir);
+    const bool success = fs_create_directory_with_parents(params.logdir);
     if (!success) {
         fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
             __func__, params.logdir.c_str());
@@ -64,7 +64,7 @@ static void write_logfile(
     fprintf(logfile, "binary: main\n");
     char model_desc[128];
     llama_model_desc(model, model_desc, sizeof(model_desc));
-    dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);
+    yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
 
     fprintf(logfile, "\n");
     fprintf(logfile, "######################\n");
@@ -72,9 +72,9 @@ static void write_logfile(
     fprintf(logfile, "######################\n");
     fprintf(logfile, "\n");
 
-    dump_vector_float_yaml(logfile, "logits", results.logits);
+    yaml_dump_vector_float(logfile, "logits", results.logits);
     fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
-    dump_vector_float_yaml(logfile, "probs", results.probs);
+    yaml_dump_vector_float(logfile, "probs", results.probs);
 
     llama_dump_timing_info_yaml(logfile, ctx);
     fclose(logfile);
@@ -1425,7 +1425,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
     // Use all tasks
     tasks.resize(n_task);
     printf("%s: reading tasks", __func__);
-    int n_dot = n_task/100;
+    int n_dot = std::max((int) n_task/100, 1);
     int i = 0;
     for (auto& task : tasks) {
         ++i;
@@ -1675,7 +1675,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
     llama_batch_free(batch);
 
-    if (n_done < 100) return;
+    if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;
 
     float p = 1.f*n_correct/n_done;
     float sigma = sqrt(p*(1-p)/(n_done-1));
@@ -2007,7 +2007,7 @@ int main(int argc, char ** argv) {
 
     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
+        params.prompt = string_random_prompt(rng);
     }
 
     llama_backend_init();
@@ -2035,7 +2035,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", get_system_info(params).c_str());
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }
 
     struct results_perplexity results;
package/src/llama.cpp/examples/quantize/quantize.cpp

@@ -259,7 +259,7 @@ int main(int argc, char ** argv) {
                usage(argv[0]);
            }
        } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
-            if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
+            if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
                usage(argv[0]);
            }
        } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
@@ -284,7 +284,7 @@ int main(int argc, char ** argv) {
            } else {
                usage(argv[0]);
            }
-        } else if (strcmp(argv[arg_idx], "--keep-split")) {
+        } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
            params.keep_split = true;
        } else {
            usage(argv[0]);
package/src/llama.cpp/examples/retrieval/retrieval.cpp

@@ -11,7 +11,7 @@ struct retrieval_params {
 };
 
 static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
-    gpt_print_usage(argc, argv, gpt_params);
+    gpt_params_print_usage(argc, argv, gpt_params);
     printf("retrieval options:\n");
     printf(" --context-file FNAME file containing context to embed.\n");
     printf(" specify multiple files by providing --context-file option multiple times.\n");
@@ -226,7 +226,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", get_system_info(params).c_str());
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }
 
     // max batch size
package/src/llama.cpp/examples/rpc/CMakeLists.txt

@@ -0,0 +1,2 @@
+add_executable(rpc-server rpc-server.cpp)
+target_link_libraries(rpc-server PRIVATE ggml llama)
package/src/llama.cpp/examples/rpc/rpc-server.cpp

@@ -0,0 +1,134 @@
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include "ggml-rpc.h"
+#ifdef _WIN32
+#  include <windows.h>
+#else
+#  include <unistd.h>
+#endif
+#include <string>
+#include <stdio.h>
+
+struct rpc_server_params {
+    std::string host = "0.0.0.0";
+    int port = 50052;
+    size_t backend_mem = 0;
+};
+
+static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
+    fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, " -h, --help show this help message and exit\n");
+    fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
+    fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
+    fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
+    fprintf(stderr, "\n");
+}
+
+static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg == "-H" || arg == "--host") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.host = argv[i];
+        } else if (arg == "-p" || arg == "--port") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.port = std::stoi(argv[i]);
+            if (params.port <= 0 || params.port > 65535) {
+                return false;
+            }
+        } else if (arg == "-m" || arg == "--mem") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
+        } else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+    return true;
+}
+
+static ggml_backend_t create_backend() {
+    ggml_backend_t backend = NULL;
+#ifdef GGML_USE_CUDA
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    backend = ggml_backend_cuda_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#elif GGML_USE_METAL
+    fprintf(stderr, "%s: using Metal backend\n", __func__);
+    backend = ggml_backend_metal_init();
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    }
+#endif
+
+    // if there aren't GPU Backends fallback to CPU backend
+    if (!backend) {
+        fprintf(stderr, "%s: using CPU backend\n", __func__);
+        backend = ggml_backend_cpu_init();
+    }
+    return backend;
+}
+
+static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
+#ifdef GGML_USE_CUDA
+    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+#else
+    #ifdef _WIN32
+        MEMORYSTATUSEX status;
+        status.dwLength = sizeof(status);
+        GlobalMemoryStatusEx(&status);
+        *total_mem = status.ullTotalPhys;
+        *free_mem = status.ullAvailPhys;
+    #else
+        long pages = sysconf(_SC_PHYS_PAGES);
+        long page_size = sysconf(_SC_PAGE_SIZE);
+        *total_mem = pages * page_size;
+        *free_mem = *total_mem;
+    #endif
+#endif
+}
+
+int main(int argc, char * argv[]) {
+    rpc_server_params params;
+    if (!rpc_server_params_parse(argc, argv, params)) {
+        fprintf(stderr, "Invalid parameters\n");
+        return 1;
+    }
+    ggml_backend_t backend = create_backend();
+    if (!backend) {
+        fprintf(stderr, "Failed to create backend\n");
+        return 1;
+    }
+    std::string endpoint = params.host + ":" + std::to_string(params.port);
+    size_t free_mem, total_mem;
+    if (params.backend_mem > 0) {
+        free_mem = params.backend_mem;
+        total_mem = params.backend_mem;
+    } else {
+        get_backend_memory(&free_mem, &total_mem);
+    }
+    printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
+    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
+    ggml_backend_free(backend);
+    return 0;
+}
package/src/llama.cpp/examples/server/server.cpp

@@ -102,7 +102,6 @@ struct slot_params {
     bool stream = true;
     bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
 
-    uint32_t seed = -1; // RNG seed
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
     int32_t n_predict = -1; // new tokens to predict
@@ -651,9 +650,6 @@ struct server_context {
     std::string system_prompt;
     std::vector<llama_token> system_tokens;
 
-    std::string name_user; // this should be the antiprompt
-    std::string name_assistant;
-
     // slots / clients
     std::vector<server_slot> slots;
     json default_generation_settings_for_props;
@@ -673,6 +669,15 @@ struct server_context {
             llama_free_model(model);
             model = nullptr;
         }
+
+        // Clear any sampling context
+        for (server_slot & slot : slots) {
+            if (slot.ctx_sampling != nullptr) {
+                llama_sampling_free(slot.ctx_sampling);
+            }
+        }
+
+        llama_batch_free(batch);
     }
 
     bool load_model(const gpt_params & params_) {
@@ -1014,7 +1019,7 @@ struct server_context {
                     sampler_names.emplace_back(sampler_name);
                 }
             }
-            slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+            slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
         } else {
             slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
         }
@@ -1098,15 +1103,11 @@ struct server_context {
         system_need_update = false;
     }
 
-    void system_prompt_set(const json & sys_props) {
-        system_prompt = sys_props.value("prompt", "");
-        name_user = sys_props.value("anti_prompt", "");
-        name_assistant = sys_props.value("assistant_name", "");
+    bool system_prompt_set(const std::string & sys_prompt) {
+        system_prompt = sys_prompt;
 
         LOG_VERBOSE("system prompt process", {
             {"system_prompt", system_prompt},
-            {"name_user", name_user},
-            {"name_assistant", name_assistant},
         });
 
         // release all slots
@@ -1115,6 +1116,7 @@ struct server_context {
         }
 
         system_need_update = true;
+        return true;
     }
 
     bool process_token(completion_token_output & result, server_slot & slot) {
@@ -1254,14 +1256,14 @@ struct server_context {
         std::vector<std::string> samplers_sequence;
         samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
         for (const auto & sampler_type : slot.sparams.samplers_sequence) {
-            samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
+            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
         }
 
         return json {
             {"n_ctx", slot.n_ctx},
             {"n_predict", slot.n_predict},
             {"model", params.model_alias},
-            {"seed", slot.params.seed},
+            {"seed", slot.sparams.seed},
             {"temperature", slot.sparams.temp},
             {"dynatemp_range", slot.sparams.dynatemp_range},
             {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
@@ -1534,7 +1536,8 @@ struct server_context {
                 }
 
                 if (task.data.contains("system_prompt")) {
-                    system_prompt_set(task.data.at("system_prompt"));
+                    std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
+                    system_prompt_set(sys_prompt);
 
                     for (server_slot & slot : slots) {
                         slot.n_past = 0;
@@ -1978,8 +1981,7 @@ struct server_context {
                     slot.state = SLOT_STATE_PROCESSING;
                     slot.command = SLOT_COMMAND_NONE;
                     slot.release();
-                    slot.print_timings();
-                    send_final_response(slot);
+                    send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
                     continue;
                 }
             } else {
@@ -2270,10 +2272,10 @@ struct server_context {
 
                 const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
                 if (n_probs > 0) {
-                    const size_t n_considered = slot.ctx_sampling->n_considered;
+                    const size_t n_valid = slot.ctx_sampling->n_valid;
 
                     // Make sure at least n_probs top tokens are at the front of the vector:
-                    if (slot.sparams.temp == 0.0f && n_probs > n_considered) {
+                    if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
                         llama_sample_top_k(ctx, &cur_p, n_probs, 0);
                     }
 
@@ -2289,7 +2291,7 @@ struct server_context {
                 for (size_t i = 0; i < n_probs; ++i) {
                     result.probs.push_back({
                         cur_p.data[i].id,
-                        i >= n_considered ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
+                        i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
                     });
                 }
             }
@@ -2383,6 +2385,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
     printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
     printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
+    printf(" --rpc SERVERS comma separated list of RPC servers\n");
     printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
     printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
     printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
@@ -2435,6 +2438,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                break;
            }
            sparams.port = std::stoi(argv[i]);
+        } else if (arg == "--rpc") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rpc_servers = argv[i];
        } else if (arg == "--host") {
            if (++i >= argc) {
                invalid_param = true;
@@ -2843,7 +2852,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                invalid_param = true;
                break;
            }
-            if (!parse_kv_override(argv[i], params.kv_overrides)) {
+            if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
                invalid_param = true;
                break;
@@ -2918,7 +2927,7 @@ int main(int argc, char ** argv) {
    server_params_parse(argc, argv, sparams, params);
 
    if (!sparams.system_prompt.empty()) {
-        ctx_server.system_prompt_set(json::parse(sparams.system_prompt));
+        ctx_server.system_prompt_set(sparams.system_prompt);
    }
 
    if (params.model_alias == "unknown") {
@@ -3301,7 +3310,7 @@ int main(int argc, char ** argv) {
    const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
        std::string filename = request_data.at("filename");
-        if (!validate_file_name(filename)) {
+        if (!fs_validate_filename(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
        }
@@ -3331,7 +3340,7 @@ int main(int argc, char ** argv) {
    const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
        std::string filename = request_data.at("filename");
-        if (!validate_file_name(filename)) {
+        if (!fs_validate_filename(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
        }
@@ -3407,8 +3416,7 @@ int main(int argc, char ** argv) {
    const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
        json data = {
-            { "user_name", ctx_server.name_user.c_str() },
-            { "assistant_name", ctx_server.name_assistant.c_str() },
+            { "system_prompt", ctx_server.system_prompt.c_str() },
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots", ctx_server.params.n_parallel }
        };
package/src/llama.cpp/examples/server/utils.hpp

@@ -371,7 +371,7 @@ static json oaicompat_completion_params_parse(
    llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
    llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
    llama_params["stream"] = json_value(body, "stream", false);
-    llama_params["temperature"] = json_value(body, "temperature", 0.0);
+    llama_params["temperature"] = json_value(body, "temperature", 1.0);
    llama_params["top_p"] = json_value(body, "top_p", 1.0);
 
    // Apply chat template to the list of messages