llama-cpp-pydist 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
- llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
- vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
- vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
- vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
- vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
- vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
- vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
- vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
- vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
- vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
- vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
- vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
- vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
- vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
- vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
- vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
- vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
- vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
- vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
- vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
- vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
- vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
- vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
- vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
- vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
- vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
- vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
- vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
- vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
- llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
- vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
- {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
- {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
--- tools/server/server-queue.cpp
+++ tools/server/server-queue.cpp

@@ -33,6 +33,7 @@ int server_queue::post(server_task && task, bool front) {
     } else {
         queue_tasks.push_back(std::move(task));
     }
+    time_last_task = ggml_time_ms();
     condition_tasks.notify_one();
     return task_id;
 }

@@ -54,6 +55,7 @@ int server_queue::post(std::vector<server_task> && tasks, bool front) {
             queue_tasks.push_back(std::move(task));
         }
     }
+    time_last_task = ggml_time_ms();
     condition_tasks.notify_one();
     return 0;
 }

@@ -62,6 +64,7 @@ void server_queue::defer(server_task && task) {
     std::unique_lock<std::mutex> lock(mutex_tasks);
     QUE_DBG("defer task, id = %d\n", task.id);
     queue_tasks_deferred.push_back(std::move(task));
+    time_last_task = ggml_time_ms();
     condition_tasks.notify_one();
 }

@@ -71,31 +74,52 @@ int server_queue::get_new_id() {
     return new_id;
 }
 
-void server_queue::on_new_task(std::function<void(server_task &&)> callback) {
-    callback_new_task = std::move(callback);
-}
-
-void server_queue::on_update_slots(std::function<void(void)> callback) {
-    callback_update_slots = std::move(callback);
-}
-
 void server_queue::pop_deferred_task() {
     std::unique_lock<std::mutex> lock(mutex_tasks);
     if (!queue_tasks_deferred.empty()) {
         queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
         queue_tasks_deferred.pop_front();
     }
+    time_last_task = ggml_time_ms();
     condition_tasks.notify_one();
 }
 
+void server_queue::wait_until_no_sleep() {
+    std::unique_lock<std::mutex> lock(mutex_tasks);
+    if (!sleeping) {
+        return;
+    } else {
+        if (!req_stop_sleeping) {
+            QUE_DBG("%s", "requesting to stop sleeping\n");
+            req_stop_sleeping = true;
+            condition_tasks.notify_one(); // only main thread is waiting on this
+        }
+        QUE_DBG("%s", "waiting until no sleep\n");
+        condition_tasks.wait(lock, [&]{
+            return !sleeping;
+        });
+    }
+}
+
 void server_queue::terminate() {
     std::unique_lock<std::mutex> lock(mutex_tasks);
     running = false;
     condition_tasks.notify_all();
 }
 
-void server_queue::start_loop() {
+void server_queue::start_loop(int64_t idle_sleep_ms) {
     running = true;
+    time_last_task = ggml_time_ms();
+
+    constexpr auto max_wait_time = std::chrono::seconds(1);
+    auto should_sleep = [&]() -> bool {
+        // caller must hold mutex_tasks
+        if (idle_sleep_ms < 0) {
+            return false;
+        }
+        int64_t now = ggml_time_ms();
+        return (now - time_last_task) >= idle_sleep_ms;
+    };
 
     while (true) {
         QUE_DBG("%s", "processing new tasks\n");

@@ -117,23 +141,53 @@ void server_queue::start_loop() {
             QUE_DBG("processing task, id = %d\n", task.id);
             callback_new_task(std::move(task));
         }
-
         // all tasks in the current loop is processed, slots data is now ready
         QUE_DBG("%s", "update slots\n");
 
+        // this will run the main inference process for all slots
         callback_update_slots();
+        {
+            // update_slots() may take a while to finish, we need to make sure it's not counted as idle
+            std::unique_lock<std::mutex> lock(mutex_tasks);
+            time_last_task = ggml_time_ms();
+        }
 
         QUE_DBG("%s", "waiting for new tasks\n");
-        {
+        while (true) {
             std::unique_lock<std::mutex> lock(mutex_tasks);
-            if (!running) {
-
-                return;
+            if (!running || !queue_tasks.empty()) {
+                break; // go back to process new tasks or terminate
             }
-
+
+            // no tasks, check for sleeping state
+            if (should_sleep()) {
+                QUE_INF("%s", "entering sleeping state\n");
+                sleeping = true;
+                callback_sleeping_state(true);
+                req_stop_sleeping = false;
+                // wait until we are requested to exit sleeping state
                 condition_tasks.wait(lock, [&]{
+                    return (!running || req_stop_sleeping);
+                });
+                if (!running) { // may changed during sleep
+                    break; // terminate
+                }
+                QUE_INF("%s", "exiting sleeping state\n");
+                req_stop_sleeping = false;
+                callback_sleeping_state(false);
+                sleeping = false;
+                time_last_task = ggml_time_ms();
+                condition_tasks.notify_all(); // notify wait_until_no_sleep()
+                break; // process new tasks
+            } else {
+                // wait for new tasks or timeout for checking sleeping condition
+                bool res = condition_tasks.wait_for(lock, max_wait_time, [&]{
                     return (!queue_tasks.empty() || !running);
                 });
+                if (res) {
+                    break; // new task arrived or terminate
+                }
+                // otherwise, loop again to check sleeping condition
             }
         }
     }
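
The start_loop() rewrite above is built around a bounded condition-variable wait: instead of blocking indefinitely, the loop wakes at least once per max_wait_time, checks whether idle_sleep_ms has elapsed since time_last_task, and enters a sleeping state if so. A minimal, self-contained sketch of the same pattern follows (hypothetical names, not the server's actual types):

    #include <chrono>
    #include <condition_variable>
    #include <cstdio>
    #include <deque>
    #include <mutex>
    #include <thread>

    // Sketch of the idle-sleep loop introduced above: the worker waits with a
    // bounded timeout so it can periodically re-check how long it has been idle,
    // and flips into a "sleeping" state once the idle threshold has elapsed.
    struct idle_queue {
        std::mutex              mtx;
        std::condition_variable cv;
        std::deque<int>         tasks;
        bool                    running  = true;
        bool                    sleeping = false;
        std::chrono::steady_clock::time_point last_task = std::chrono::steady_clock::now();

        void loop(std::chrono::milliseconds idle_sleep) {
            while (true) {
                std::unique_lock<std::mutex> lock(mtx);
                while (running && tasks.empty()) {
                    const bool idle = std::chrono::steady_clock::now() - last_task >= idle_sleep;
                    if (idle && !sleeping) {
                        sleeping = true;               // e.g. release heavy resources here
                        std::puts("entering sleep");
                    }
                    // bounded wait: wake at least once per second to re-check idleness
                    cv.wait_for(lock, std::chrono::seconds(1));
                }
                if (!running) {
                    return;
                }
                if (sleeping) {
                    sleeping = false;                  // e.g. re-acquire resources here
                    std::puts("exiting sleep");
                }
                tasks.pop_front();                     // "process" one task
                last_task = std::chrono::steady_clock::now();
            }
        }

        void post(int t) {
            {
                std::lock_guard<std::mutex> lock(mtx);
                tasks.push_back(t);
                last_task = std::chrono::steady_clock::now();
            }
            cv.notify_all();
        }

        void stop() {
            {
                std::lock_guard<std::mutex> lock(mtx);
                running = false;
            }
            cv.notify_all();
        }
    };

    int main() {
        idle_queue q;
        std::thread worker([&]{ q.loop(std::chrono::milliseconds(50)); });
        q.post(1);
        std::this_thread::sleep_for(std::chrono::milliseconds(200)); // long enough to trigger sleep
        q.post(2);                                                   // wakes the sleeping worker
        std::this_thread::sleep_for(std::chrono::milliseconds(20));
        q.stop();
        worker.join();
    }

As in the diff, the key detail is that the wait is time-bounded: a plain wait() could never notice that the queue has been idle for idle_sleep_ms, because no notification arrives when nothing happens.
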
@@ -271,23 +325,25 @@ void server_response::terminate() {
 // server_response_reader
 //
 
-void server_response_reader::post_task(server_task && task) {
+void server_response_reader::post_task(server_task && task, bool front) {
     GGML_ASSERT(id_tasks.empty() && "post_task() can only be called once per reader");
+    task.index = 0;
     id_tasks.insert(task.id);
     states.push_back(task.create_state());
     queue_results.add_waiting_task_id(task.id);
-    queue_tasks.post(std::move(task));
+    queue_tasks.post(std::move(task), front);
 }
 
-void server_response_reader::post_tasks(std::vector<server_task> && tasks) {
+void server_response_reader::post_tasks(std::vector<server_task> && tasks, bool front) {
     GGML_ASSERT(id_tasks.empty() && "post_tasks() can only be called once per reader");
     id_tasks = server_task::get_list_id(tasks);
     states.reserve(tasks.size());
     for (size_t i = 0; i < tasks.size(); i++) {
+        tasks[i].index = i;
         states.push_back(tasks[i].create_state());
     }
     queue_results.add_waiting_tasks(tasks);
-    queue_tasks.post(std::move(tasks));
+    queue_tasks.post(std::move(tasks), front);
 }
 
 bool server_response_reader::has_next() const {

@@ -313,7 +369,7 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()> & should_stop) {
     }
     if (!states.empty()) {
         // update the generation state if needed
-        size_t idx = result->get_index();
+        const size_t idx = result->index;
         GGML_ASSERT(idx < states.size());
         result->update(states[idx]);
     }

@@ -329,6 +385,7 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()> & should_stop) {
 
 server_response_reader::batch_response server_response_reader::wait_for_all(const std::function<bool()> & should_stop) {
     batch_response batch_res;
+    batch_res.results.clear();
     batch_res.results.resize(id_tasks.size());
     while (has_next()) {
         auto res = next(should_stop);

@@ -340,7 +397,7 @@ server_response_reader::batch_response server_response_reader::wait_for_all(const std::function<bool()> & should_stop) {
             batch_res.error = std::move(res);
             return batch_res;
         }
-        const size_t idx = res->get_index();
+        const size_t idx = res->index;
         GGML_ASSERT(idx < batch_res.results.size() && "index out of range");
         GGML_ASSERT(batch_res.results[idx] == nullptr && "duplicate result received");
         batch_res.results[idx] = std::move(res);
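
The reader changes above implement a small scatter-gather scheme: post_tasks() stamps each task with its position in the batch, and wait_for_all() stores each arriving result at results[res->index], so results can complete in any order and still come back in request order. A self-contained sketch of the same idea (illustrative only):

    #include <cassert>
    #include <cstddef>
    #include <future>
    #include <memory>
    #include <vector>

    // Each task carries its batch position; results are written back by index,
    // mirroring the post_tasks()/wait_for_all() pair above.
    struct result {
        size_t index; // position of the originating task in the batch
        int    value;
    };

    int main() {
        const size_t n = 4;

        // scatter: launch tasks tagged with their batch index
        std::vector<std::future<result>> futures;
        for (size_t i = 0; i < n; i++) {
            futures.push_back(std::async(std::launch::async, [i] {
                return result{i, static_cast<int>(i * 10)}; // any completion order
            }));
        }

        // gather: place each result at its index, as wait_for_all() does
        std::vector<std::unique_ptr<result>> results(n);
        for (auto & f : futures) {
            auto res = std::make_unique<result>(f.get());
            const size_t idx = res->index;
            assert(idx < results.size() && "index out of range");
            assert(results[idx] == nullptr && "duplicate result received");
            results[idx] = std::move(res);
        }

        for (size_t i = 0; i < n; i++) {
            assert(results[i]->index == i && results[i]->value == static_cast<int>(i) * 10);
        }
    }
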
--- tools/server/server-queue.h
+++ tools/server/server-queue.h

@@ -5,6 +5,7 @@
 #include <condition_variable>
 #include <deque>
 #include <mutex>
+#include <vector>
 #include <unordered_set>
 
 // struct for managing server tasks

@@ -12,7 +13,10 @@
 struct server_queue {
 private:
     int id = 0;
-    bool running;
+    bool running = false;
+    bool sleeping = false;
+    bool req_stop_sleeping = false;
+    int64_t time_last_task = 0;
 
     // queues
     std::deque<server_task> queue_tasks;

@@ -24,6 +28,7 @@ private:
     // callback functions
     std::function<void(server_task &&)> callback_new_task;
     std::function<void(void)> callback_update_slots;
+    std::function<void(bool)> callback_sleeping_state;
 
 public:
     // Add a new task to the end of the queue

@@ -38,15 +43,18 @@ public:
     // Get the next id for creating a new task
     int get_new_id();
 
-    // Register function to process a new task
-    void on_new_task(std::function<void(server_task &&)> callback);
-
-    // Register the function to be called when all slots data is ready to be processed
-    void on_update_slots(std::function<void(void)> callback);
-
     // Call when the state of one slot is changed, it will move one task from deferred to main queue
     void pop_deferred_task();
 
+    // if sleeping, request exiting sleep state and wait until it is done
+    // returns immediately if not sleeping
+    void wait_until_no_sleep();
+
+    bool is_sleeping() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        return sleeping;
+    }
+
     // end the start_loop routine
     void terminate();
 

@@ -56,8 +64,15 @@ public:
      * - Process the task (i.e. maybe copy data into slot)
      * - Check if multitask is finished
      * - Update all slots
+     *
+     * Sleeping procedure (disabled if idle_sleep_ms < 0):
+     * - If there is no task after idle_sleep_ms, enter sleeping state
+     * - Call callback_sleeping_state(true)
+     * - Wait until req_stop_sleeping is set to true
+     * - Call callback_sleeping_state(false)
+     * - Exit sleeping state
      */
-    void start_loop();
+    void start_loop(int64_t idle_sleep_ms = -1);
 
     // for metrics
     size_t queue_tasks_deferred_size() {

@@ -65,6 +80,27 @@ public:
         return queue_tasks_deferred.size();
     }
 
+    //
+    // Functions below are not thread-safe, must only be used before start_loop() is called
+    //
+
+    // Register function to process a new task
+    void on_new_task(std::function<void(server_task &&)> callback) {
+        callback_new_task = std::move(callback);
+    }
+
+    // Register the function to be called when all slots data is ready to be processed
+    void on_update_slots(std::function<void(void)> callback) {
+        callback_update_slots = std::move(callback);
+    }
+
+    // Register callback for sleeping state change
+    // note: when entering sleeping state, the callback is called AFTER sleeping is set to true
+    //       when leaving sleeping state, the callback is called BEFORE sleeping is set to false
+    void on_sleeping_state(std::function<void(bool)> callback) {
+        callback_sleeping_state = std::move(callback);
+    }
+
 private:
     void cleanup_pending_task(int id_target);
 };
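
The wait_until_no_sleep()/is_sleeping() additions above describe a two-flag handshake on a single condition variable: a request thread raises req_stop_sleeping and waits for !sleeping, while the main loop wakes on req_stop_sleeping, clears sleeping, and notifies everyone back. A minimal, self-contained sketch of that handshake (standalone code, not the server's implementation):

    #include <condition_variable>
    #include <cstdio>
    #include <mutex>
    #include <thread>

    std::mutex              mtx;
    std::condition_variable cv;
    bool sleeping          = true;
    bool req_stop_sleeping = false;

    void wait_until_no_sleep() {       // called from a request thread
        std::unique_lock<std::mutex> lock(mtx);
        if (!sleeping) {
            return;
        }
        req_stop_sleeping = true;      // ask the main loop to wake up
        cv.notify_one();
        cv.wait(lock, []{ return !sleeping; });
    }

    void sleeper() {                   // stands in for the sleeping main loop
        std::unique_lock<std::mutex> lock(mtx);
        cv.wait(lock, []{ return req_stop_sleeping; });
        std::puts("waking up (e.g. reload the model here)");
        req_stop_sleeping = false;
        sleeping = false;
        cv.notify_all();               // release every wait_until_no_sleep() caller
    }

    int main() {
        std::thread main_loop(sleeper);
        std::thread request(wait_until_no_sleep);
        request.join();                // returns only after sleep has ended
        main_loop.join();
    }

Note the ordering contract spelled out in the on_sleeping_state() comment above: the state-change callback runs after sleeping flips to true on entry, and before it flips back to false on exit, so observers never see a "not sleeping" state while resources are still released.
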
@@ -138,8 +174,10 @@ struct server_response_reader {
     int get_new_id() {
         return queue_tasks.get_new_id();
     }
-    void post_task(server_task && task);
-    void post_tasks(std::vector<server_task> && tasks);
+
+    // if front = true, the task will be posted to the front of the queue (high priority)
+    void post_task(server_task && task, bool front = false);
+    void post_tasks(std::vector<server_task> && tasks, bool front = false);
     bool has_next() const;
 
     // return nullptr if should_stop() is true before receiving a result
--- tools/server/server-task.cpp
+++ tools/server/server-task.cpp

@@ -32,8 +32,8 @@ json task_params::to_json(bool only_metrics) const {
     }
 
     json lora = json::array();
-    for (size_t i = 0; i < this->lora.size(); ++i) {
-        lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
+    for (auto & it : this->lora) {
+        lora.push_back({{"id", it.first}, {"scale", it.second}});
     }
 
     if (only_metrics) {

@@ -78,6 +78,7 @@ json task_params::to_json(bool only_metrics) const {
         {"speculative.p_min", speculative.p_min},
         {"timings_per_token", timings_per_token},
         {"post_sampling_probs", post_sampling_probs},
+        {"backend_sampling", sampling.backend_sampling},
         {"lora", lora},
     };
 }

@@ -136,6 +137,7 @@ json task_params::to_json(bool only_metrics) const {
         {"speculative.p_min", speculative.p_min},
         {"timings_per_token", timings_per_token},
         {"post_sampling_probs", post_sampling_probs},
+        {"backend_sampling", sampling.backend_sampling},
         {"lora", lora},
     };
 }

@@ -145,12 +147,10 @@ json task_params::to_json(bool only_metrics) const {
 //
 
 task_params server_task::params_from_json_cmpl(
-        const llama_context * ctx,
+        const llama_vocab * vocab,
         const common_params & params_base,
+        const int n_ctx_slot,
         const json & data) {
-    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
     task_params params;
 
     // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)

@@ -206,6 +206,7 @@ task_params server_task::params_from_json_cmpl(
     params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
     params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
     params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
+    params.sampling.backend_sampling = json_value(data, "backend_sampling", defaults.sampling.backend_sampling);
     params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);
 
     params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);

@@ -223,12 +224,12 @@ task_params server_task::params_from_json_cmpl(
 
     if (data.contains("lora")) {
         if (data.at("lora").is_array()) {
-            params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
+            params.lora = parse_lora_request(data.at("lora"));
         } else {
             throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
         }
     } else {
-        params.lora = params_base.lora_adapters;
+        params.lora = {};
     }
 
     // TODO: add more sanity checks for the input parameters
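
Taken together with the header change below (std::map<int, float> lora), these hunks switch per-request LoRA configuration from a full adapter list to a sparse adapter-ID -> scale map, so a request only overrides the adapters it names. A sketch of parsing a request's "lora" array into that representation, assuming the nlohmann::json type used throughout these files (parse_lora_scales is a hypothetical helper, not necessarily the body of parse_lora_request):

    #include <map>
    #include <stdexcept>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    // Hypothetical parser: turns [{"id": 0, "scale": 0.5}, ...] into a sparse map.
    static std::map<int, float> parse_lora_scales(const json & arr) {
        std::map<int, float> lora;
        for (const auto & entry : arr) {
            if (!entry.contains("id") || !entry.contains("scale")) {
                throw std::runtime_error("'lora' entries need 'id' and 'scale' fields");
            }
            lora[entry.at("id").get<int>()] = entry.at("scale").get<float>();
        }
        return lora;
    }

    int main() {
        const json req = json::parse(R"({"lora": [{"id": 0, "scale": 0.5}, {"id": 2, "scale": 1.0}]})");
        const auto scales = parse_lora_scales(req.at("lora"));
        // scales == {{0, 0.5f}, {2, 1.0f}}; unlisted adapters keep their defaults
        return scales.size() == 2 ? 0 : 1;
    }
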
@@ -243,11 +244,11 @@ task_params server_task::params_from_json_cmpl(
 
     if (params.sampling.penalty_last_n == -1) {
         // note: should be the slot's context and not the full context, but it's ok
-        params.sampling.penalty_last_n = llama_n_ctx(ctx);
+        params.sampling.penalty_last_n = n_ctx_slot;
     }
 
     if (params.sampling.dry_penalty_last_n == -1) {
-        params.sampling.dry_penalty_last_n = llama_n_ctx(ctx);
+        params.sampling.dry_penalty_last_n = n_ctx_slot;
     }
 
     if (params.sampling.dry_base < 1.0f) {

@@ -1153,7 +1154,7 @@ json server_task_result_rerank::to_json() {
 json server_task_result_cmpl_partial::to_json_anthropic() {
     json events = json::array();
     bool first = (n_decoded == 1);
-
+    bool text_block_started = false;
 
     if (first) {
         text_block_started = false;

@@ -1324,6 +1325,30 @@ json server_task_result_slot_erase::to_json() {
     };
 }
 
+//
+// server_task_result_get_lora
+//
+
+json server_task_result_get_lora::to_json() {
+    json result = json::array();
+    for (size_t i = 0; i < loras.size(); ++i) {
+        auto & lora = loras[i];
+        json entry = {
+            {"id", i},
+            {"path", lora.info.path},
+            {"scale", lora.info.scale},
+            {"task_name", lora.info.task_name},
+            {"prompt_prefix", lora.info.prompt_prefix},
+        };
+        if (!lora.alora_invocation_tokens.empty()) {
+            entry["alora_invocation_string"] = lora.alora_invocation_string;
+            entry["alora_invocation_tokens"] = lora.alora_invocation_tokens;
+        }
+        result.push_back(std::move(entry));
+    }
+    return result;
+}
+
 //
 // server_task_result_apply_lora
 //
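
For reference, the snippet below builds the same JSON shape that server_task_result_get_lora::to_json() above produces, one entry per adapter with the aLoRA fields present only when invocation tokens exist. It is a self-contained illustration with made-up sample values (path, task name, and tokens are not real), assuming nlohmann::json:

    #include <iostream>
    #include <vector>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main() {
        json result = json::array();

        json entry = {
            {"id", 0},
            {"path", "/models/adapters/example.gguf"}, // hypothetical path
            {"scale", 0.5},
            {"task_name", "example-task"},             // hypothetical name
            {"prompt_prefix", ""},
        };
        // only present when the adapter has aLoRA invocation tokens
        entry["alora_invocation_string"] = "<invoke>";            // illustrative
        entry["alora_invocation_tokens"] = std::vector<int>{1, 2, 3};
        result.push_back(std::move(entry));

        std::cout << result.dump(2) << std::endl;
    }
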
--- tools/server/server-task.h
+++ tools/server/server-task.h

@@ -6,6 +6,7 @@
 #include <string>
 #include <unordered_set>
 #include <list>
+#include <map>
 
 // TODO: prevent including the whole server-common.h as we only use server_tokens
 #include "server-common.h"

@@ -23,6 +24,7 @@ enum server_task_type {
     SERVER_TASK_TYPE_SLOT_SAVE,
     SERVER_TASK_TYPE_SLOT_RESTORE,
     SERVER_TASK_TYPE_SLOT_ERASE,
+    SERVER_TASK_TYPE_GET_LORA,
     SERVER_TASK_TYPE_SET_LORA,
 };
 

@@ -60,7 +62,7 @@ struct task_params {
     int64_t t_max_prompt_ms = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
-    std::vector<common_adapter_lora_info> lora;
+    std::map<int, float> lora; // mapping adapter ID -> scale
 
     std::vector<std::string> antiprompt;
     std::vector<std::string> response_fields;

@@ -105,8 +107,10 @@ struct task_result_state {
 };
 
 struct server_task {
-    int id    = -1; // to be filled by server_queue
-    int index = -1; // used when there are multiple prompts (batch request)
+    int id = -1; // to be filled by server_queue
+
+    // TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
+    size_t index = 0; // used when there are multiple prompts (batch request)
 
     // used by SERVER_TASK_TYPE_CANCEL
     int id_target = -1;

@@ -138,7 +142,7 @@ struct server_task {
     bool metrics_reset_bucket = false;
 
     // used by SERVER_TASK_TYPE_SET_LORA
-    std::vector<common_adapter_lora_info> set_lora;
+    std::map<int, float> set_lora; // mapping adapter ID -> scale
 
     server_task() = default;
 

@@ -149,9 +153,10 @@ struct server_task {
     }
 
     static task_params params_from_json_cmpl(
-            const llama_context * ctx,
-            const common_params & params_base,
-            const json & data);
+            const llama_vocab * vocab,
+            const common_params & params_base,
+            const int n_ctx_slot,
+            const json & data);
 
     // utility function
     static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {

@@ -162,10 +167,9 @@ struct server_task {
         return ids;
     }
 
-    server_task create_child(int id_parent, int id_child, int idx) const {
+    server_task create_child(int id_parent, int id_child) const {
         server_task copy;
         copy.id = id_child;
-        copy.index = idx;
         copy.id_parent = id_parent;
         copy.params = params;
         copy.type = type;

@@ -212,6 +216,10 @@ struct result_prompt_progress {
 struct server_task_result {
     int id = -1;
     int id_slot = -1;
+
+    // TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
+    size_t index = 0; // to be used for batched tasks
+
     virtual bool is_error() {
         // only used by server_task_result_error
         return false;

@@ -220,9 +228,6 @@ struct server_task_result {
         // only used by server_task_result_cmpl_*
         return true;
     }
-    virtual int get_index() {
-        return -1;
-    }
     virtual void update(task_result_state &) {
         // only used by server_task_result_cmpl_*
     }

@@ -255,8 +260,6 @@ struct completion_token_output {
 };
 
 struct server_task_result_cmpl_final : server_task_result {
-    int index = 0;
-
     std::string content;
     llama_tokens tokens;
 

@@ -289,10 +292,6 @@ struct server_task_result_cmpl_final : server_task_result {
     std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
     bool is_updated = false;
 
-    virtual int get_index() override {
-        return index;
-    }
-
     virtual bool is_stop() override {
         return true; // in stream mode, final responses are considered stop
     }

@@ -318,8 +317,6 @@ struct server_task_result_cmpl_final : server_task_result {
 };
 
 struct server_task_result_cmpl_partial : server_task_result {
-    int index = 0;
-
     std::string content;
     llama_tokens tokens;
 

@@ -340,10 +337,6 @@ struct server_task_result_cmpl_partial : server_task_result {
     std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
     bool is_updated = false;
 
-    virtual int get_index() override {
-        return index;
-    }
-
     virtual bool is_stop() override {
         return false; // in stream mode, partial responses are not considered stop
     }

@@ -365,7 +358,6 @@ struct server_task_result_cmpl_partial : server_task_result {
 };
 
 struct server_task_result_embd : server_task_result {
-    int index = 0;
     std::vector<std::vector<float>> embedding;
 
     int32_t n_tokens;

@@ -373,10 +365,6 @@ struct server_task_result_embd : server_task_result {
     // response formatting
     task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
 
-    virtual int get_index() override {
-        return index;
-    }
-
     virtual json to_json() override;
 
     json to_json_non_oaicompat();

@@ -385,20 +373,14 @@ struct server_task_result_embd : server_task_result {
 };
 
 struct server_task_result_rerank : server_task_result {
-    int index = 0;
     float score = -1e6;
 
     int32_t n_tokens;
 
-    virtual int get_index() override {
-        return index;
-    }
-
     virtual json to_json() override;
 };
 
 struct server_task_result_error : server_task_result {
-    int index = 0;
     error_type err_type = ERROR_TYPE_SERVER;
     std::string err_msg;
 

@@ -460,6 +442,17 @@ struct server_task_result_slot_erase : server_task_result {
     virtual json to_json() override;
 };
 
+struct server_task_result_get_lora : server_task_result {
+    struct lora {
+        common_adapter_lora_info info;
+        std::string alora_invocation_string;
+        llama_tokens alora_invocation_tokens;
+    };
+    std::vector<lora> loras;
+
+    virtual json to_json() override;
+};
+
 struct server_task_result_apply_lora : server_task_result {
     virtual json to_json() override;
 };
--- tools/server/server.cpp
+++ tools/server/server.cpp

@@ -66,7 +66,7 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
     };
 }
 
-int main(int argc, char ** argv, char ** envp) {
+int main(int argc, char ** argv) {
     // own arguments required by this example
     common_params params;
 

@@ -119,14 +119,14 @@ int main(int argc, char ** argv, char ** envp) {
     //
 
     // register API routes
-    server_routes routes(params, ctx_server
+    server_routes routes(params, ctx_server);
 
     bool is_router_server = params.model.path.empty();
     std::optional<server_models_routes> models_routes{};
     if (is_router_server) {
         // setup server instances manager
         try {
-            models_routes.emplace(params, argc, argv, envp);
+            models_routes.emplace(params, argc, argv);
         } catch (const std::exception & e) {
             LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what());
             return 1;

@@ -252,7 +252,7 @@ int main(int argc, char ** argv, char ** envp) {
         return 1;
     }
 
-
+    routes.update_meta(ctx_server);
     ctx_http.is_ready.store(true);
 
     LOG_INF("%s: model loaded\n", __func__);

@@ -309,7 +309,11 @@ int main(int argc, char ** argv, char ** envp) {
         if (monitor_thread.joinable()) {
            monitor_thread.join();
         }
-
+
+        auto * ll_ctx = ctx_server.get_llama_context();
+        if (ll_ctx != nullptr) {
+            llama_memory_breakdown_print(ll_ctx);
+        }
     }
 
     return 0;