@fugood/llama.node 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/bin/darwin/arm64/default.metallib +0 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/default.metallib +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/LlamaContext.cpp +2 -2
  19. package/src/LoadSessionWorker.cpp +1 -0
  20. package/src/llama.cpp/CMakeLists.txt +72 -46
  21. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  22. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  23. package/src/llama.cpp/common/common.cpp +732 -752
  24. package/src/llama.cpp/common/common.h +47 -41
  25. package/src/llama.cpp/common/grammar-parser.cpp +1 -1
  26. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  27. package/src/llama.cpp/common/log.h +5 -5
  28. package/src/llama.cpp/common/sampling.cpp +89 -7
  29. package/src/llama.cpp/common/sampling.h +5 -0
  30. package/src/llama.cpp/common/train.cpp +2 -2
  31. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  32. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  33. package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
  34. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  35. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  36. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  37. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  38. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  39. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
  40. package/src/llama.cpp/examples/llava/clip.h +1 -1
  41. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  42. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  43. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  44. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  45. package/src/llama.cpp/examples/main/main.cpp +24 -16
  46. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  47. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  48. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  49. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  50. package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
  51. package/src/llama.cpp/examples/server/server.cpp +21 -9
  52. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  53. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  54. package/src/llama.cpp/ggml-backend.c +0 -1
  55. package/src/llama.cpp/ggml-common.h +0 -54
  56. package/src/llama.cpp/ggml-cuda.h +1 -0
  57. package/src/llama.cpp/ggml-impl.h +51 -0
  58. package/src/llama.cpp/ggml-kompute.cpp +4 -0
  59. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  60. package/src/llama.cpp/ggml-quants.c +3700 -2041
  61. package/src/llama.cpp/ggml-rpc.cpp +188 -56
  62. package/src/llama.cpp/ggml-sycl.cpp +99 -530
  63. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  64. package/src/llama.cpp/ggml-vulkan.cpp +202 -225
  65. package/src/llama.cpp/ggml.c +1034 -1154
  66. package/src/llama.cpp/ggml.h +59 -31
  67. package/src/llama.cpp/llama.cpp +859 -609
  68. package/src/llama.cpp/llama.h +19 -6
  69. package/src/llama.cpp/requirements.txt +0 -1
  70. package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
  71. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  72. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  73. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  74. package/src/llama.cpp/unicode-data.h +15 -12
  75. package/src/llama.cpp/unicode.cpp +89 -111
  76. package/src/llama.cpp/unicode.h +44 -12
  77. package/src/llama.cpp/build.zig +0 -172
  78. package/src/llama.cpp/ggml-mpi.c +0 -216
  79. package/src/llama.cpp/ggml-mpi.h +0 -39
  80. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
@@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
 static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
 struct {
- struct ggml_tensor * newline;
 struct ggml_context * ctx;
 } model;
 
@@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
 model.ctx = ggml_init(params);
 
- ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
- model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
- if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
- if (newline_tmp->buffer == NULL) {
- LOG_TEE("newline_tmp tensor buffer is NULL\n");
- }
- ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
- } else {
- model.newline->data = newline_tmp->data;
- if (model.newline->data == NULL) {
- LOG_TEE("newline_tmp tensor data is NULL\n");
- }
- }
-
 struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
 // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
 // fill it with the image embeddings, ignoring the base
@@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
 // debug
 if (dump_kv_cache) {
 llama_kv_cache_view_update(ctx, &kvc_view);
- dump_kv_cache_view_seqs(kvc_view, 40);
+ llama_kv_cache_dump_view_seqs(kvc_view, 40);
 }
 
 // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
@@ -121,7 +121,7 @@ int main(int argc, char ** argv){
 // debug
 if (dump_kv_cache) {
 llama_kv_cache_view_update(ctx, &kvc_view);
- dump_kv_cache_view_seqs(kvc_view, 40);
+ llama_kv_cache_dump_view_seqs(kvc_view, 40);
 }
 
 // print current draft sequence
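
The lookahead and lookup hunks above pick up the renamed KV-cache debug helper from common.h (dump_kv_cache_view_seqs → llama_kv_cache_dump_view_seqs). A minimal sketch of the debug pattern after the rename, assuming a valid llama_context; the wrapper name debug_kv_cache is illustrative:

    #include "common.h"
    #include "llama.h"

    // Print which sequences currently occupy the KV cache (debug aid only).
    static void debug_kv_cache(llama_context * ctx, int n_seq_max) {
        llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_seq_max);
        llama_kv_cache_view_update(ctx, &kvc_view);   // refresh the snapshot
        llama_kv_cache_dump_view_seqs(kvc_view, 40);  // renamed helper, 40 cells per row
        llama_kv_cache_view_free(&kvc_view);
    }
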
@@ -60,9 +60,9 @@ static void write_logfile(
 return;
 }
 
- const std::string timestamp = get_sortable_timestamp();
+ const std::string timestamp = string_get_sortable_timestamp();
 
- const bool success = create_directory_with_parents(params.logdir);
+ const bool success = fs_create_directory_with_parents(params.logdir);
 if (!success) {
 fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
 __func__, params.logdir.c_str());
@@ -80,7 +80,7 @@ static void write_logfile(
 fprintf(logfile, "binary: main\n");
 char model_desc[128];
 llama_model_desc(model, model_desc, sizeof(model_desc));
- dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
+ yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
 
 fprintf(logfile, "\n");
 fprintf(logfile, "######################\n");
@@ -88,8 +88,8 @@ static void write_logfile(
 fprintf(logfile, "######################\n");
 fprintf(logfile, "\n");
 
- dump_string_yaml_multiline(logfile, "output", output.c_str());
- dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
+ yaml_dump_string_multiline(logfile, "output", output.c_str());
+ yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
 llama_dump_timing_info_yaml(logfile, ctx);
 fclose(logfile);
@@ -181,7 +181,7 @@ int main(int argc, char ** argv) {
 
 std::mt19937 rng(params.seed);
 if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
+ params.prompt = string_random_prompt(rng);
 }
 
 LOG("%s: llama backend init\n", __func__);
@@ -219,7 +219,7 @@ int main(int argc, char ** argv) {
 // print system information
 {
 LOG_TEE("\n");
- LOG_TEE("%s\n", get_system_info(params).c_str());
+ LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
 }
 
 std::string path_session = params.path_prompt_cache;
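
The main.cpp hunks above (and the matching ones in perplexity.cpp and retrieval.cpp further down) follow a naming pass over common.h: free helpers gained string_, fs_, yaml_ and gpt_params_ prefixes. A hedged sketch of the renamed helpers working together; log_run_header is a hypothetical wrapper, only the called functions come from this diff:

    #include "common.h"
    #include <cstdio>

    // Hypothetical startup logging built on the renamed common.h helpers.
    static bool log_run_header(const gpt_params & params) {
        // was create_directory_with_parents()
        if (!fs_create_directory_with_parents(params.logdir)) {
            return false;
        }
        // was get_sortable_timestamp() and get_system_info()
        fprintf(stderr, "run: %s\n", string_get_sortable_timestamp().c_str());
        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
        return true;
    }
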
@@ -474,12 +474,12 @@ int main(int argc, char ** argv) {
 LOG_TEE("\n\n");
 
 if (params.interactive) {
- const char *control_message;
+ const char * control_message;
 if (params.multiline_input) {
- control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+ control_message = " - To return control to the AI, end your input with '\\'.\n"
 " - To return control without starting a new line, end your input with '/'.\n";
 } else {
- control_message = " - Press Return to return control to LLaMa.\n"
+ control_message = " - Press Return to return control to the AI.\n"
 " - To return control without starting a new line, end your input with '/'.\n"
 " - If you want to submit another line, end your input with '\\'.\n";
 }
@@ -707,7 +707,7 @@ int main(int argc, char ** argv) {
 
 const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
 
- llama_sampling_accept(ctx_sampling, ctx, id, true);
+ llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
 
 LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
 
@@ -728,7 +728,7 @@ int main(int argc, char ** argv) {
 
 // push the prompt in the sampling context in order to apply repetition penalties later
 // for the prompt, we don't apply grammar rules
- llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+ llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
 
 ++n_consumed;
 if ((int) embd.size() >= params.n_batch) {
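
The two llama_sampling_accept hunks only add named-argument comments, but they spell out the convention: grammar state advances when accepting sampled tokens and is skipped when replaying prompt tokens. A minimal sketch of that loop shape, assuming an initialized llama_sampling_context; sample_next is an illustrative name:

    #include "common.h"
    #include "sampling.h"
    #include <vector>

    // Feed the prompt without grammar, then sample one token with grammar applied.
    static llama_token sample_next(llama_sampling_context * ctx_sampling,
                                   llama_context * ctx,
                                   const std::vector<llama_token> & prompt_tokens) {
        for (const llama_token tok : prompt_tokens) {
            // penalties only; the grammar parser does not advance on prompt tokens
            llama_sampling_accept(ctx_sampling, ctx, tok, /* apply_grammar= */ false);
        }
        const llama_token id = llama_sampling_sample(ctx_sampling, ctx, /* ctx_cfg= */ nullptr);
        llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
        return id;
    }
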
@@ -740,18 +740,26 @@ int main(int argc, char ** argv) {
 // display text
 if (input_echo && display) {
 for (auto id : embd) {
- const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
- printf("%s", token_str.c_str());
+ const std::string token_str = llama_token_to_piece(ctx, id, params.special);
 
+ // Console/Stream Output
+ fprintf(stdout, "%s", token_str.c_str());
+
+ // Record Displayed Tokens To Log
+ // Note: Generated tokens are created one by one hence this check
 if (embd.size() > 1) {
+ // Incoming Requested Tokens
 input_tokens.push_back(id);
 } else {
+ // Outgoing Generated Tokens
 output_tokens.push_back(id);
 output_ss << token_str;
 }
+
+ fflush(stdout);
 }
- fflush(stdout);
 }
+
 // reset color to default if there is no pending user input
 if (input_echo && (int) embd_inp.size() == n_consumed) {
 console::set_display(console::reset);
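
The display hunk above renders each token with params.special, writes it to stdout, and flushes per token so partial output streams immediately; batched tokens (embd.size() > 1) are recorded as input, single sampled tokens as output. A small illustrative printer along the same lines (stream_token is not part of the package):

    #include "common.h"
    #include <cstdio>
    #include <string>

    // Render one token (optionally including special tokens) and flush right away.
    static void stream_token(llama_context * ctx, llama_token id, bool special, std::string & transcript) {
        const std::string piece = llama_token_to_piece(ctx, id, special);
        fprintf(stdout, "%s", piece.c_str());
        fflush(stdout);        // per-token flush, as in the hunk above
        transcript += piece;   // keep a copy for logging
    }
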
@@ -879,7 +887,7 @@ int main(int argc, char ** argv) {
 embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
 }
 if (params.escape) {
- process_escapes(buffer);
+ string_process_escapes(buffer);
 }
 
 const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
@@ -210,7 +210,7 @@ int main(int argc, char ** argv) {
 while (true) {
 if (dump_kv_cache) {
 llama_kv_cache_view_update(ctx, &kvc_view);
- dump_kv_cache_view_seqs(kvc_view, 40);
+ llama_kv_cache_dump_view_seqs(kvc_view, 40);
 }
 
 llama_batch_clear(batch);
@@ -44,9 +44,9 @@ static void write_logfile(
 return;
 }
 
- const std::string timestamp = get_sortable_timestamp();
+ const std::string timestamp = string_get_sortable_timestamp();
 
- const bool success = create_directory_with_parents(params.logdir);
+ const bool success = fs_create_directory_with_parents(params.logdir);
 if (!success) {
 fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
 __func__, params.logdir.c_str());
@@ -64,7 +64,7 @@ static void write_logfile(
 fprintf(logfile, "binary: main\n");
 char model_desc[128];
 llama_model_desc(model, model_desc, sizeof(model_desc));
- dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);
+ yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
 
 fprintf(logfile, "\n");
 fprintf(logfile, "######################\n");
@@ -72,9 +72,9 @@ static void write_logfile(
 fprintf(logfile, "######################\n");
 fprintf(logfile, "\n");
 
- dump_vector_float_yaml(logfile, "logits", results.logits);
+ yaml_dump_vector_float(logfile, "logits", results.logits);
 fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
- dump_vector_float_yaml(logfile, "probs", results.probs);
+ yaml_dump_vector_float(logfile, "probs", results.probs);
 
 llama_dump_timing_info_yaml(logfile, ctx);
 fclose(logfile);
@@ -1425,7 +1425,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 // Use all tasks
 tasks.resize(n_task);
 printf("%s: reading tasks", __func__);
- int n_dot = n_task/100;
+ int n_dot = std::max((int) n_task/100, 1);
 int i = 0;
 for (auto& task : tasks) {
 ++i;
@@ -1675,7 +1675,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
 llama_batch_free(batch);
 
- if (n_done < 100) return;
+ if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;
 
 float p = 1.f*n_correct/n_done;
 float sigma = sqrt(p*(1-p)/(n_done-1));
@@ -2007,7 +2007,7 @@ int main(int argc, char ** argv) {
 
 std::mt19937 rng(params.seed);
 if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
+ params.prompt = string_random_prompt(rng);
 }
 
 llama_backend_init();
@@ -2035,7 +2035,7 @@ int main(int argc, char ** argv) {
 // print system information
 {
 fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
 }
 
 struct results_perplexity results;
@@ -259,7 +259,7 @@ int main(int argc, char ** argv) {
 usage(argv[0]);
 }
 } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
- if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
+ if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
 usage(argv[0]);
 }
 } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
@@ -284,7 +284,7 @@ int main(int argc, char ** argv) {
 } else {
 usage(argv[0]);
 }
- } else if (strcmp(argv[arg_idx], "--keep-split")) {
+ } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
 params.keep_split = true;
 } else {
 usage(argv[0]);
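
The --keep-split hunk fixes a classic strcmp pitfall: strcmp returns 0 on a match, so the unqualified strcmp(...) condition was true for every argument except "--keep-split". A standalone illustration of the corrected test (not the quantize tool itself):

    #include <cstring>
    #include <cstdio>

    int main(int argc, char ** argv) {
        bool keep_split = false;
        for (int i = 1; i < argc; i++) {
            // strcmp() returns 0 when the strings are equal, so compare against 0.
            if (strcmp(argv[i], "--keep-split") == 0) {
                keep_split = true;
            }
        }
        printf("keep_split = %s\n", keep_split ? "true" : "false");
        return 0;
    }
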
@@ -11,7 +11,7 @@ struct retrieval_params {
 };
 
 static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
- gpt_print_usage(argc, argv, gpt_params);
+ gpt_params_print_usage(argc, argv, gpt_params);
 printf("retrieval options:\n");
 printf(" --context-file FNAME file containing context to embed.\n");
 printf(" specify multiple files by providing --context-file option multiple times.\n");
@@ -226,7 +226,7 @@ int main(int argc, char ** argv) {
 // print system information
 {
 fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
 }
 
 // max batch size
@@ -7,9 +7,64 @@
 #endif
 
 #include "ggml-rpc.h"
+ #ifdef _WIN32
+ # include <windows.h>
+ #else
+ # include <unistd.h>
+ #endif
 #include <string>
 #include <stdio.h>
 
+ struct rpc_server_params {
+ std::string host = "0.0.0.0";
+ int port = 50052;
+ size_t backend_mem = 0;
+ };
+
+ static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
+ fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
+ fprintf(stderr, "options:\n");
+ fprintf(stderr, " -h, --help show this help message and exit\n");
+ fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
+ fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
+ fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
+ fprintf(stderr, "\n");
+ }
+
+ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
+ std::string arg;
+ for (int i = 1; i < argc; i++) {
+ arg = argv[i];
+ if (arg == "-H" || arg == "--host") {
+ if (++i >= argc) {
+ return false;
+ }
+ params.host = argv[i];
+ } else if (arg == "-p" || arg == "--port") {
+ if (++i >= argc) {
+ return false;
+ }
+ params.port = std::stoi(argv[i]);
+ if (params.port <= 0 || params.port > 65535) {
+ return false;
+ }
+ } else if (arg == "-m" || arg == "--mem") {
+ if (++i >= argc) {
+ return false;
+ }
+ params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
+ } else if (arg == "-h" || arg == "--help") {
+ print_usage(argc, argv, params);
+ exit(0);
+ } else {
+ fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+ print_usage(argc, argv, params);
+ exit(0);
+ }
+ }
+ return true;
+ }
+
 static ggml_backend_t create_backend() {
 ggml_backend_t backend = NULL;
 #ifdef GGML_USE_CUDA
@@ -38,21 +93,25 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
 #ifdef GGML_USE_CUDA
 ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
 #else
- // TODO: implement for other backends
- *free_mem = 1;
- *total_mem = 1;
+ #ifdef _WIN32
+ MEMORYSTATUSEX status;
+ status.dwLength = sizeof(status);
+ GlobalMemoryStatusEx(&status);
+ *total_mem = status.ullTotalPhys;
+ *free_mem = status.ullAvailPhys;
+ #else
+ long pages = sysconf(_SC_PHYS_PAGES);
+ long page_size = sysconf(_SC_PAGE_SIZE);
+ *total_mem = pages * page_size;
+ *free_mem = *total_mem;
+ #endif
 #endif
 }
 
 int main(int argc, char * argv[]) {
- if (argc < 3) {
- fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
- return 1;
- }
- const char * host = argv[1];
- int port = std::stoi(argv[2]);
- if (port <= 0 || port > 65535) {
- fprintf(stderr, "Invalid port number: %d\n", port);
+ rpc_server_params params;
+ if (!rpc_server_params_parse(argc, argv, params)) {
+ fprintf(stderr, "Invalid parameters\n");
 return 1;
 }
 ggml_backend_t backend = create_backend();
@@ -60,10 +119,15 @@ int main(int argc, char * argv[]) {
 fprintf(stderr, "Failed to create backend\n");
 return 1;
 }
- printf("Starting RPC server on %s:%d\n", host, port);
+ std::string endpoint = params.host + ":" + std::to_string(params.port);
 size_t free_mem, total_mem;
- get_backend_memory(&free_mem, &total_mem);
- std::string endpoint = std::string(host) + ":" + std::to_string(port);
+ if (params.backend_mem > 0) {
+ free_mem = params.backend_mem;
+ total_mem = params.backend_mem;
+ } else {
+ get_backend_memory(&free_mem, &total_mem);
+ }
+ printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
 start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
 ggml_backend_free(backend);
 return 0;
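
The rpc-server hunks replace the positional <host> <port> arguments with -H/--host, -p/--port and -m/--mem flags and report usable backend memory at startup. On the client side, ggml's RPC backend connects to that endpoint; a hedged sketch, assuming a build with the RPC backend enabled and the ggml_backend_rpc_init / ggml_backend_rpc_get_device_memory entry points from ggml-rpc.h (the address is illustrative):

    #include "ggml-rpc.h"
    #include <cstdio>

    // Connect to an rpc-server started e.g. as: rpc-server -H 0.0.0.0 -p 50052 -m 2048
    int main() {
        const char * endpoint = "192.168.1.10:50052";  // illustrative address

        ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
        if (backend == nullptr) {
            fprintf(stderr, "failed to connect to %s\n", endpoint);
            return 1;
        }

        size_t free_mem = 0, total_mem = 0;
        ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);
        printf("remote backend: %zu MB free / %zu MB total\n",
               free_mem / (1024 * 1024), total_mem / (1024 * 1024));

        ggml_backend_free(backend);
        return 0;
    }
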
@@ -102,7 +102,6 @@ struct slot_params {
 bool stream = true;
 bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
 
- uint32_t seed = -1; // RNG seed
 int32_t n_keep = 0; // number of tokens to keep from initial prompt
 int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
 int32_t n_predict = -1; // new tokens to predict
@@ -671,6 +670,13 @@ struct server_context {
 model = nullptr;
 }
 
+ // Clear any sampling context
+ for (server_slot & slot : slots) {
+ if (slot.ctx_sampling != nullptr) {
+ llama_sampling_free(slot.ctx_sampling);
+ }
+ }
+
 llama_batch_free(batch);
 }
 
@@ -1013,7 +1019,7 @@ struct server_context {
 sampler_names.emplace_back(sampler_name);
 }
 }
- slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+ slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
 } else {
 slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
 }
@@ -1250,14 +1256,14 @@ struct server_context {
 std::vector<std::string> samplers_sequence;
 samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
 for (const auto & sampler_type : slot.sparams.samplers_sequence) {
- samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
+ samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
 }
 
 return json {
 {"n_ctx", slot.n_ctx},
 {"n_predict", slot.n_predict},
 {"model", params.model_alias},
- {"seed", slot.params.seed},
+ {"seed", slot.sparams.seed},
 {"temperature", slot.sparams.temp},
 {"dynatemp_range", slot.sparams.dynatemp_range},
 {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
@@ -1975,8 +1981,7 @@ struct server_context {
 slot.state = SLOT_STATE_PROCESSING;
 slot.command = SLOT_COMMAND_NONE;
 slot.release();
- slot.print_timings();
- send_final_response(slot);
+ send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
 continue;
 }
 } else {
@@ -2380,6 +2385,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
 printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
 printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
 printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
+ printf(" --rpc SERVERS comma separated list of RPC servers\n");
 printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
 printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
 printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
@@ -2432,6 +2438,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 break;
 }
 sparams.port = std::stoi(argv[i]);
+ } else if (arg == "--rpc") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.rpc_servers = argv[i];
 } else if (arg == "--host") {
 if (++i >= argc) {
 invalid_param = true;
@@ -2840,7 +2852,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 invalid_param = true;
 break;
 }
- if (!parse_kv_override(argv[i], params.kv_overrides)) {
+ if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
 fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
 invalid_param = true;
 break;
@@ -3298,7 +3310,7 @@ int main(int argc, char ** argv) {
 const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
 json request_data = json::parse(req.body);
 std::string filename = request_data.at("filename");
- if (!validate_file_name(filename)) {
+ if (!fs_validate_filename(filename)) {
 res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
 return;
 }
@@ -3328,7 +3340,7 @@ int main(int argc, char ** argv) {
 const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
 json request_data = json::parse(req.body);
 std::string filename = request_data.at("filename");
- if (!validate_file_name(filename)) {
+ if (!fs_validate_filename(filename)) {
 res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
 return;
 }