@fugood/llama.node 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/bin/darwin/arm64/default.metallib +0 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/default.metallib +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/LlamaContext.cpp +2 -2
  19. package/src/LoadSessionWorker.cpp +1 -0
  20. package/src/llama.cpp/CMakeLists.txt +72 -46
  21. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  22. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  23. package/src/llama.cpp/common/common.cpp +732 -752
  24. package/src/llama.cpp/common/common.h +47 -41
  25. package/src/llama.cpp/common/grammar-parser.cpp +1 -1
  26. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  27. package/src/llama.cpp/common/log.h +5 -5
  28. package/src/llama.cpp/common/sampling.cpp +89 -7
  29. package/src/llama.cpp/common/sampling.h +5 -0
  30. package/src/llama.cpp/common/train.cpp +2 -2
  31. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  32. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  33. package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
  34. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  35. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  36. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  37. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  38. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  39. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
  40. package/src/llama.cpp/examples/llava/clip.h +1 -1
  41. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  42. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  43. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  44. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  45. package/src/llama.cpp/examples/main/main.cpp +24 -16
  46. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  47. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  48. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  49. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  50. package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
  51. package/src/llama.cpp/examples/server/server.cpp +21 -9
  52. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  53. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  54. package/src/llama.cpp/ggml-backend.c +0 -1
  55. package/src/llama.cpp/ggml-common.h +0 -54
  56. package/src/llama.cpp/ggml-cuda.h +1 -0
  57. package/src/llama.cpp/ggml-impl.h +51 -0
  58. package/src/llama.cpp/ggml-kompute.cpp +4 -0
  59. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  60. package/src/llama.cpp/ggml-quants.c +3700 -2041
  61. package/src/llama.cpp/ggml-rpc.cpp +188 -56
  62. package/src/llama.cpp/ggml-sycl.cpp +99 -530
  63. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  64. package/src/llama.cpp/ggml-vulkan.cpp +202 -225
  65. package/src/llama.cpp/ggml.c +1034 -1154
  66. package/src/llama.cpp/ggml.h +59 -31
  67. package/src/llama.cpp/llama.cpp +859 -609
  68. package/src/llama.cpp/llama.h +19 -6
  69. package/src/llama.cpp/requirements.txt +0 -1
  70. package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
  71. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  72. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  73. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  74. package/src/llama.cpp/unicode-data.h +15 -12
  75. package/src/llama.cpp/unicode.cpp +89 -111
  76. package/src/llama.cpp/unicode.h +44 -12
  77. package/src/llama.cpp/build.zig +0 -172
  78. package/src/llama.cpp/ggml-mpi.c +0 -216
  79. package/src/llama.cpp/ggml-mpi.h +0 -39
  80. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
@@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
 static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
 struct {
- struct ggml_tensor * newline;
 struct ggml_context * ctx;
 } model;
 
@@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
 model.ctx = ggml_init(params);
 
- ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
- model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
- if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
- if (newline_tmp->buffer == NULL) {
- LOG_TEE("newline_tmp tensor buffer is NULL\n");
- }
- ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
- } else {
- model.newline->data = newline_tmp->data;
- if (model.newline->data == NULL) {
- LOG_TEE("newline_tmp tensor data is NULL\n");
- }
- }
-
 struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
 // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
 // fill it with the image embeddings, ignoring the base
@@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
 // debug
 if (dump_kv_cache) {
 llama_kv_cache_view_update(ctx, &kvc_view);
- dump_kv_cache_view_seqs(kvc_view, 40);
+ llama_kv_cache_dump_view_seqs(kvc_view, 40);
 }
 
 // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
@@ -121,7 +121,7 @@ int main(int argc, char ** argv){
 // debug
 if (dump_kv_cache) {
 llama_kv_cache_view_update(ctx, &kvc_view);
- dump_kv_cache_view_seqs(kvc_view, 40);
+ llama_kv_cache_dump_view_seqs(kvc_view, 40);
 }
 
 // print current draft sequence
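
The lookahead and lookup hunks above pick up the renamed KV-cache debug helper from common.h (dump_kv_cache_view_seqs → llama_kv_cache_dump_view_seqs). A minimal sketch of the debug pattern after the rename, assuming a valid llama_context; the wrapper name debug_kv_cache is illustrative:

    #include "common.h"
    #include "llama.h"

    // Print which sequences currently occupy the KV cache (debug aid only).
    static void debug_kv_cache(llama_context * ctx, int n_seq_max) {
        llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_seq_max);
        llama_kv_cache_view_update(ctx, &kvc_view);   // refresh the snapshot
        llama_kv_cache_dump_view_seqs(kvc_view, 40);  // renamed helper, 40 cells per row
        llama_kv_cache_view_free(&kvc_view);
    }
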
@@ -60,9 +60,9 @@ static void write_logfile(
 return;
 }
 
- const std::string timestamp = get_sortable_timestamp();
+ const std::string timestamp = string_get_sortable_timestamp();
 
- const bool success = create_directory_with_parents(params.logdir);
+ const bool success = fs_create_directory_with_parents(params.logdir);
 if (!success) {
 fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
 __func__, params.logdir.c_str());
@@ -80,7 +80,7 @@ static void write_logfile(
 fprintf(logfile, "binary: main\n");
 char model_desc[128];
 llama_model_desc(model, model_desc, sizeof(model_desc));
- dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
+ yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
 
 fprintf(logfile, "\n");
 fprintf(logfile, "######################\n");
@@ -88,8 +88,8 @@ static void write_logfile(
 fprintf(logfile, "######################\n");
 fprintf(logfile, "\n");
 
- dump_string_yaml_multiline(logfile, "output", output.c_str());
- dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
+ yaml_dump_string_multiline(logfile, "output", output.c_str());
+ yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
 llama_dump_timing_info_yaml(logfile, ctx);
 fclose(logfile);
@@ -181,7 +181,7 @@ int main(int argc, char ** argv) {
 
 std::mt19937 rng(params.seed);
 if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
+ params.prompt = string_random_prompt(rng);
 }
 
 LOG("%s: llama backend init\n", __func__);
@@ -219,7 +219,7 @@ int main(int argc, char ** argv) {
 // print system information
 {
 LOG_TEE("\n");
- LOG_TEE("%s\n", get_system_info(params).c_str());
+ LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
 }
 
 std::string path_session = params.path_prompt_cache;
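
The main.cpp hunks above (and the matching ones in perplexity.cpp and retrieval.cpp further down) follow a naming pass over common.h: free helpers gained string_, fs_, yaml_ and gpt_params_ prefixes. A hedged sketch of the renamed helpers working together; log_run_header is a hypothetical wrapper, only the called functions come from this diff:

    #include "common.h"
    #include <cstdio>

    // Hypothetical startup logging built on the renamed common.h helpers.
    static bool log_run_header(const gpt_params & params) {
        // was create_directory_with_parents()
        if (!fs_create_directory_with_parents(params.logdir)) {
            return false;
        }
        // was get_sortable_timestamp() and get_system_info()
        fprintf(stderr, "run: %s\n", string_get_sortable_timestamp().c_str());
        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
        return true;
    }
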
@@ -474,12 +474,12 @@ int main(int argc, char ** argv) {
 LOG_TEE("\n\n");
 
 if (params.interactive) {
- const char *control_message;
+ const char * control_message;
 if (params.multiline_input) {
- control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+ control_message = " - To return control to the AI, end your input with '\\'.\n"
 " - To return control without starting a new line, end your input with '/'.\n";
 } else {
- control_message = " - Press Return to return control to LLaMa.\n"
+ control_message = " - Press Return to return control to the AI.\n"
 " - To return control without starting a new line, end your input with '/'.\n"
 " - If you want to submit another line, end your input with '\\'.\n";
 }
@@ -707,7 +707,7 @@ int main(int argc, char ** argv) {
 
 const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
 
- llama_sampling_accept(ctx_sampling, ctx, id, true);
+ llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
 
 LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
 
@@ -728,7 +728,7 @@ int main(int argc, char ** argv) {
 
 // push the prompt in the sampling context in order to apply repetition penalties later
 // for the prompt, we don't apply grammar rules
- llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+ llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
 
 ++n_consumed;
 if ((int) embd.size() >= params.n_batch) {
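
The two llama_sampling_accept hunks only add named-argument comments, but they spell out the convention: grammar state advances when accepting sampled tokens and is skipped when replaying prompt tokens. A minimal sketch of that loop shape, assuming an initialized llama_sampling_context; sample_next is an illustrative name:

    #include "common.h"
    #include "sampling.h"
    #include <vector>

    // Feed the prompt without grammar, then sample one token with grammar applied.
    static llama_token sample_next(llama_sampling_context * ctx_sampling,
                                   llama_context * ctx,
                                   const std::vector<llama_token> & prompt_tokens) {
        for (const llama_token tok : prompt_tokens) {
            // penalties only; the grammar parser does not advance on prompt tokens
            llama_sampling_accept(ctx_sampling, ctx, tok, /* apply_grammar= */ false);
        }
        const llama_token id = llama_sampling_sample(ctx_sampling, ctx, /* ctx_cfg= */ nullptr);
        llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
        return id;
    }
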
@@ -740,18 +740,26 @@ int main(int argc, char ** argv) {
 // display text
 if (input_echo && display) {
 for (auto id : embd) {
- const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
- printf("%s", token_str.c_str());
+ const std::string token_str = llama_token_to_piece(ctx, id, params.special);
 
+ // Console/Stream Output
+ fprintf(stdout, "%s", token_str.c_str());
+
+ // Record Displayed Tokens To Log
+ // Note: Generated tokens are created one by one hence this check
 if (embd.size() > 1) {
+ // Incoming Requested Tokens
 input_tokens.push_back(id);
 } else {
+ // Outgoing Generated Tokens
 output_tokens.push_back(id);
 output_ss << token_str;
 }
+
+ fflush(stdout);
 }
- fflush(stdout);
 }
+
 // reset color to default if there is no pending user input
 if (input_echo && (int) embd_inp.size() == n_consumed) {
 console::set_display(console::reset);
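
The display hunk above renders each token with params.special, writes it to stdout, and flushes per token so partial output streams immediately; batched tokens (embd.size() > 1) are recorded as input, single sampled tokens as output. A small illustrative printer along the same lines (stream_token is not part of the package):

    #include "common.h"
    #include <cstdio>
    #include <string>

    // Render one token (optionally including special tokens) and flush right away.
    static void stream_token(llama_context * ctx, llama_token id, bool special, std::string & transcript) {
        const std::string piece = llama_token_to_piece(ctx, id, special);
        fprintf(stdout, "%s", piece.c_str());
        fflush(stdout);        // per-token flush, as in the hunk above
        transcript += piece;   // keep a copy for logging
    }
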
@@ -879,7 +887,7 @@ int main(int argc, char ** argv) {
 embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
 }
 if (params.escape) {
- process_escapes(buffer);
+ string_process_escapes(buffer);
 }
 
 const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
@@ -210,7 +210,7 @@ int main(int argc, char ** argv) {
 while (true) {
 if (dump_kv_cache) {
 llama_kv_cache_view_update(ctx, &kvc_view);
- dump_kv_cache_view_seqs(kvc_view, 40);
+ llama_kv_cache_dump_view_seqs(kvc_view, 40);
 }
 
 llama_batch_clear(batch);
@@ -44,9 +44,9 @@ static void write_logfile(
 return;
 }
 
- const std::string timestamp = get_sortable_timestamp();
+ const std::string timestamp = string_get_sortable_timestamp();
 
- const bool success = create_directory_with_parents(params.logdir);
+ const bool success = fs_create_directory_with_parents(params.logdir);
 if (!success) {
 fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
 __func__, params.logdir.c_str());
@@ -64,7 +64,7 @@ static void write_logfile(
 fprintf(logfile, "binary: main\n");
 char model_desc[128];
 llama_model_desc(model, model_desc, sizeof(model_desc));
- dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);
+ yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
 
 fprintf(logfile, "\n");
 fprintf(logfile, "######################\n");
@@ -72,9 +72,9 @@ static void write_logfile(
 fprintf(logfile, "######################\n");
 fprintf(logfile, "\n");
 
- dump_vector_float_yaml(logfile, "logits", results.logits);
+ yaml_dump_vector_float(logfile, "logits", results.logits);
 fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
- dump_vector_float_yaml(logfile, "probs", results.probs);
+ yaml_dump_vector_float(logfile, "probs", results.probs);
 
 llama_dump_timing_info_yaml(logfile, ctx);
 fclose(logfile);
@@ -1425,7 +1425,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 // Use all tasks
 tasks.resize(n_task);
 printf("%s: reading tasks", __func__);
- int n_dot = n_task/100;
+ int n_dot = std::max((int) n_task/100, 1);
 int i = 0;
 for (auto& task : tasks) {
 ++i;
@@ -1675,7 +1675,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
 llama_batch_free(batch);
 
- if (n_done < 100) return;
+ if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;
 
 float p = 1.f*n_correct/n_done;
 float sigma = sqrt(p*(1-p)/(n_done-1));
@@ -2007,7 +2007,7 @@ int main(int argc, char ** argv) {
 
 std::mt19937 rng(params.seed);
 if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
+ params.prompt = string_random_prompt(rng);
 }
 
 llama_backend_init();
@@ -2035,7 +2035,7 @@ int main(int argc, char ** argv) {
 // print system information
 {
 fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
 }
 
 struct results_perplexity results;
@@ -259,7 +259,7 @@ int main(int argc, char ** argv) {
 usage(argv[0]);
 }
 } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
- if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
+ if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
 usage(argv[0]);
 }
 } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
@@ -284,7 +284,7 @@ int main(int argc, char ** argv) {
 } else {
 usage(argv[0]);
 }
- } else if (strcmp(argv[arg_idx], "--keep-split")) {
+ } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
 params.keep_split = true;
 } else {
 usage(argv[0]);
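
The --keep-split hunk fixes a classic strcmp pitfall: strcmp returns 0 on a match, so the unqualified strcmp(...) condition was true for every argument except "--keep-split". A standalone illustration of the corrected test (not the quantize tool itself):

    #include <cstring>
    #include <cstdio>

    int main(int argc, char ** argv) {
        bool keep_split = false;
        for (int i = 1; i < argc; i++) {
            // strcmp() returns 0 when the strings are equal, so compare against 0.
            if (strcmp(argv[i], "--keep-split") == 0) {
                keep_split = true;
            }
        }
        printf("keep_split = %s\n", keep_split ? "true" : "false");
        return 0;
    }
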
@@ -11,7 +11,7 @@ struct retrieval_params {
 };
 
 static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
- gpt_print_usage(argc, argv, gpt_params);
+ gpt_params_print_usage(argc, argv, gpt_params);
 printf("retrieval options:\n");
 printf(" --context-file FNAME file containing context to embed.\n");
 printf(" specify multiple files by providing --context-file option multiple times.\n");
@@ -226,7 +226,7 @@ int main(int argc, char ** argv) {
 // print system information
 {
 fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
 }
 
 // max batch size
@@ -7,9 +7,64 @@
 #endif
 
 #include "ggml-rpc.h"
+ #ifdef _WIN32
+ # include <windows.h>
+ #else
+ # include <unistd.h>
+ #endif
 #include <string>
 #include <stdio.h>
 
+ struct rpc_server_params {
+ std::string host = "0.0.0.0";
+ int port = 50052;
+ size_t backend_mem = 0;
+ };
+
+ static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
+ fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
+ fprintf(stderr, "options:\n");
+ fprintf(stderr, " -h, --help show this help message and exit\n");
+ fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
+ fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
+ fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
+ fprintf(stderr, "\n");
+ }
+
+ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
+ std::string arg;
+ for (int i = 1; i < argc; i++) {
+ arg = argv[i];
+ if (arg == "-H" || arg == "--host") {
+ if (++i >= argc) {
+ return false;
+ }
+ params.host = argv[i];
+ } else if (arg == "-p" || arg == "--port") {
+ if (++i >= argc) {
+ return false;
+ }
+ params.port = std::stoi(argv[i]);
+ if (params.port <= 0 || params.port > 65535) {
+ return false;
+ }
+ } else if (arg == "-m" || arg == "--mem") {
+ if (++i >= argc) {
+ return false;
+ }
+ params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
+ } else if (arg == "-h" || arg == "--help") {
+ print_usage(argc, argv, params);
+ exit(0);
+ } else {
+ fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+ print_usage(argc, argv, params);
+ exit(0);
+ }
+ }
+ return true;
+ }
+
 static ggml_backend_t create_backend() {
 ggml_backend_t backend = NULL;
 #ifdef GGML_USE_CUDA
@@ -38,21 +93,25 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
 #ifdef GGML_USE_CUDA
 ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
 #else
- // TODO: implement for other backends
- *free_mem = 1;
- *total_mem = 1;
+ #ifdef _WIN32
+ MEMORYSTATUSEX status;
+ status.dwLength = sizeof(status);
+ GlobalMemoryStatusEx(&status);
+ *total_mem = status.ullTotalPhys;
+ *free_mem = status.ullAvailPhys;
+ #else
+ long pages = sysconf(_SC_PHYS_PAGES);
+ long page_size = sysconf(_SC_PAGE_SIZE);
+ *total_mem = pages * page_size;
+ *free_mem = *total_mem;
+ #endif
 #endif
 }
 
 int main(int argc, char * argv[]) {
- if (argc < 3) {
- fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
- return 1;
- }
- const char * host = argv[1];
- int port = std::stoi(argv[2]);
- if (port <= 0 || port > 65535) {
- fprintf(stderr, "Invalid port number: %d\n", port);
+ rpc_server_params params;
+ if (!rpc_server_params_parse(argc, argv, params)) {
+ fprintf(stderr, "Invalid parameters\n");
 return 1;
 }
 ggml_backend_t backend = create_backend();
@@ -60,10 +119,15 @@ int main(int argc, char * argv[]) {
 fprintf(stderr, "Failed to create backend\n");
 return 1;
 }
- printf("Starting RPC server on %s:%d\n", host, port);
+ std::string endpoint = params.host + ":" + std::to_string(params.port);
 size_t free_mem, total_mem;
- get_backend_memory(&free_mem, &total_mem);
- std::string endpoint = std::string(host) + ":" + std::to_string(port);
+ if (params.backend_mem > 0) {
+ free_mem = params.backend_mem;
+ total_mem = params.backend_mem;
+ } else {
+ get_backend_memory(&free_mem, &total_mem);
+ }
+ printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
 start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
 ggml_backend_free(backend);
 return 0;
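
The rpc-server hunks replace the positional <host> <port> arguments with -H/--host, -p/--port and -m/--mem flags and report usable backend memory at startup. On the client side, ggml's RPC backend connects to that endpoint; a hedged sketch, assuming a build with the RPC backend enabled and the ggml_backend_rpc_init / ggml_backend_rpc_get_device_memory entry points from ggml-rpc.h (the address is illustrative):

    #include "ggml-rpc.h"
    #include <cstdio>

    // Connect to an rpc-server started e.g. as: rpc-server -H 0.0.0.0 -p 50052 -m 2048
    int main() {
        const char * endpoint = "192.168.1.10:50052";  // illustrative address

        ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
        if (backend == nullptr) {
            fprintf(stderr, "failed to connect to %s\n", endpoint);
            return 1;
        }

        size_t free_mem = 0, total_mem = 0;
        ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);
        printf("remote backend: %zu MB free / %zu MB total\n",
               free_mem / (1024 * 1024), total_mem / (1024 * 1024));

        ggml_backend_free(backend);
        return 0;
    }
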
@@ -102,7 +102,6 @@ struct slot_params {
 bool stream = true;
 bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
 
- uint32_t seed = -1; // RNG seed
 int32_t n_keep = 0; // number of tokens to keep from initial prompt
 int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
 int32_t n_predict = -1; // new tokens to predict
@@ -671,6 +670,13 @@ struct server_context {
 model = nullptr;
 }
 
+ // Clear any sampling context
+ for (server_slot & slot : slots) {
+ if (slot.ctx_sampling != nullptr) {
+ llama_sampling_free(slot.ctx_sampling);
+ }
+ }
+
 llama_batch_free(batch);
 }
 
@@ -1013,7 +1019,7 @@ struct server_context {
 sampler_names.emplace_back(sampler_name);
 }
 }
- slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+ slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
 } else {
 slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
 }
@@ -1250,14 +1256,14 @@ struct server_context {
 std::vector<std::string> samplers_sequence;
 samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
 for (const auto & sampler_type : slot.sparams.samplers_sequence) {
- samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
+ samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
 }
 
 return json {
 {"n_ctx", slot.n_ctx},
 {"n_predict", slot.n_predict},
 {"model", params.model_alias},
- {"seed", slot.params.seed},
+ {"seed", slot.sparams.seed},
 {"temperature", slot.sparams.temp},
 {"dynatemp_range", slot.sparams.dynatemp_range},
 {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
@@ -1975,8 +1981,7 @@ struct server_context {
 slot.state = SLOT_STATE_PROCESSING;
 slot.command = SLOT_COMMAND_NONE;
 slot.release();
- slot.print_timings();
- send_final_response(slot);
+ send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
 continue;
 }
 } else {
@@ -2380,6 +2385,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
 printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
 printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
 printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
+ printf(" --rpc SERVERS comma separated list of RPC servers\n");
 printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
 printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
 printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
@@ -2432,6 +2438,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 break;
 }
 sparams.port = std::stoi(argv[i]);
+ } else if (arg == "--rpc") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.rpc_servers = argv[i];
 } else if (arg == "--host") {
 if (++i >= argc) {
 invalid_param = true;
@@ -2840,7 +2852,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 invalid_param = true;
 break;
 }
- if (!parse_kv_override(argv[i], params.kv_overrides)) {
+ if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
 fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
 invalid_param = true;
 break;
@@ -3298,7 +3310,7 @@ int main(int argc, char ** argv) {
 const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
 json request_data = json::parse(req.body);
 std::string filename = request_data.at("filename");
- if (!validate_file_name(filename)) {
+ if (!fs_validate_filename(filename)) {
 res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
 return;
 }
@@ -3328,7 +3340,7 @@ int main(int argc, char ** argv) {
 const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
 json request_data = json::parse(req.body);
 std::string filename = request_data.at("filename");
- if (!validate_file_name(filename)) {
+ if (!fs_validate_filename(filename)) {
 res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
 return;
 }