PyPI - sglang - Versions diffs - 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl - Mend

sglang 0.4.6.post4py3-none-any.whl → 0.4.7py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (358) hide show

sglang/srt/disaggregation/mini_lb.py CHANGED Viewed

@@ -50,6 +50,13 @@ class MiniLoadBalancer:
         self.prefill_servers = [p.url for p in prefill_configs]
         self.decode_servers = decode_servers
+    def add_prefill_server(self, new_prefill_config: PrefillConfig):
+        self.prefill_configs.append(new_prefill_config)
+        self.prefill_servers.append(new_prefill_config.url)
+    def add_decode_server(self, new_decode_server: str):
+        self.decode_servers.append(new_decode_server)
     def select_pair(self):
         # TODO: return some message instead of panic
         assert len(self.prefill_configs) > 0, "No prefill servers available"
@@ -73,11 +80,27 @@ class MiniLoadBalancer:
                 session.post(f"{prefill_server}/{endpoint}", json=modified_request),
                 session.post(f"{decode_server}/{endpoint}", json=modified_request),
             ]
             # Wait for both responses to complete. Prefill should end first.
-            _, decode_response = await asyncio.gather(*tasks)
+            prefill_response, decode_response = await asyncio.gather(*tasks)
+            if "return_logprob" in modified_request:
+                prefill_json = await prefill_response.json()
+                ret_json = await decode_response.json()
+                # merge `meta_info.input_token_logprobs` from prefill to decode
+                if "meta_info" in ret_json:
+                    if "input_token_logprobs" in ret_json["meta_info"]:
+                        ret_json["meta_info"]["input_token_logprobs"] = (
+                            prefill_json["meta_info"]["input_token_logprobs"]
+                            + ret_json["meta_info"]["input_token_logprobs"]
+                        )
+            else:
+                ret_json = await decode_response.json()
             return ORJSONResponse(
-                content=await decode_response.json(),
+                content=ret_json,
                 status_code=decode_response.status,
             )
@@ -92,30 +115,47 @@ class MiniLoadBalancer:
                     total=3600
                 )  # Add timeout for request reliability
             ) as session:
-                try:
-                    # Create the tasks for both prefill and decode requests
-                    tasks = [
-                        session.post(
-                            f"{prefill_server}/{endpoint}", json=modified_request
-                        ),
-                        session.post(
-                            f"{decode_server}/{endpoint}", json=modified_request
-                        ),
-                    ]
-                    # Wait for both responses to complete. Since this is streaming, they return immediately.
-                    prefill_response, decode_response = await asyncio.gather(*tasks)
+                # Create the tasks for both prefill and decode requests
+                tasks = [
+                    session.post(f"{prefill_server}/{endpoint}", json=modified_request),
+                    session.post(f"{decode_server}/{endpoint}", json=modified_request),
+                ]
+                # Wait for both responses to complete. Since this is streaming, they return immediately.
+                prefill_response, decode_response = await asyncio.gather(*tasks)
+                if modified_request.get("return_logprob", False):
+                    prefill_chunks = []
+                    async for chunk in prefill_response.content:
+                        prefill_chunks.append(chunk)
+                    first_prefill_chunk = (
+                        prefill_chunks[0].decode("utf-8")[5:].strip("\n")
+                    )
+                    first_prefill_chunk_json = orjson.loads(first_prefill_chunk)
+                    async for chunk in decode_response.content:
+                        # Note: This is inefficient
+                        # merge prefill input_token_logprobs, output_token_logprobs to decode
+                        decoded_chunk = chunk.decode("utf-8")
+                        if (
+                            decoded_chunk
+                            and decoded_chunk.startswith("data:")
+                            and "[DONE]" not in decoded_chunk
+                        ):
+                            ret_json = orjson.loads(decoded_chunk[5:].strip("\n"))
+                            ret_json["meta_info"]["input_token_logprobs"] = (
+                                first_prefill_chunk_json["meta_info"][
+                                    "input_token_logprobs"
+                                ]
+                                + ret_json["meta_info"]["input_token_logprobs"]
+                            )
+                            yield b"data: " + orjson.dumps(ret_json) + b"\n\n"
+                        else:
+                            yield chunk
+                else:
                     async for chunk in decode_response.content:
                         yield chunk
-                except Exception as e:
-                    error_msg = {
-                        "error": {"message": f"Stream processing error: {str(e)}"}
-                    }
-                    yield b"data: " + orjson.dumps(
-                        error_msg, option=orjson.OPT_NON_STR_KEYS
-                    ) + b"\n\n"
-                finally:
-                    if prefill_response is not None:
-                        await prefill_response.release()
         return StreamingResponse(
             stream_results(),
@@ -124,7 +164,7 @@ class MiniLoadBalancer:
 app = FastAPI()
-load_balancer = None
+load_balancer: Optional[MiniLoadBalancer] = None
 @app.get("/health")
@@ -234,8 +274,7 @@ async def handle_generate_request(request_data: dict):
         )
-@app.post("/v1/chat/completions")
-async def handle_completion_request(request_data: dict):
+async def _forward_to_backend(request_data: dict, endpoint_name: str):
     prefill_server, bootstrap_port, decode_server = load_balancer.select_pair()
     # Parse and transform prefill_server for bootstrap data
@@ -246,7 +285,7 @@ async def handle_completion_request(request_data: dict):
         {
             "bootstrap_host": hostname,
             "bootstrap_port": bootstrap_port,
-            "bootstrap_room": random.randint(0, 2**63 - 1),
+            "bootstrap_room": _generate_bootstrap_room(),
         }
     )
@@ -255,17 +294,27 @@ async def handle_completion_request(request_data: dict):
             modified_request,
             prefill_server,
             decode_server,
-            endpoint="v1/chat/completions",
+            endpoint=endpoint_name,
         )
     else:
         return await load_balancer.generate(
             modified_request,
             prefill_server,
             decode_server,
-            endpoint="v1/chat/completions",
+            endpoint=endpoint_name,
         )
+@app.post("/v1/chat/completions")
+async def handle_chat_completion_request(request_data: dict):
+    return await _forward_to_backend(request_data, "v1/chat/completions")
+@app.post("/v1/completions")
+async def handle_completion_request(request_data: dict):
+    return await _forward_to_backend(request_data, "v1/completions")
 def _generate_bootstrap_room():
     return random.randint(0, 2**63 - 1)
@@ -298,14 +347,14 @@ async def get_models():
 @app.post("/register")
 async def register(obj: PDRegistryRequest):
     if obj.mode == "prefill":
-        load_balancer.prefill_configs.append(
+        load_balancer.add_prefill_server(
             PrefillConfig(obj.registry_url, obj.bootstrap_port)
         )
         logger.info(
             f"Registered prefill server: {obj.registry_url} with bootstrap port: {obj.bootstrap_port}"
         )
     elif obj.mode == "decode":
-        load_balancer.decode_servers.append(obj.registry_url)
+        load_balancer.add_decode_server(obj.registry_url)
         logger.info(f"Registered decode server: {obj.registry_url}")
     else:
         raise HTTPException(
@@ -328,42 +377,7 @@ def run(prefill_configs, decode_addrs, host, port):
 if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(description="Mini Load Balancer Server")
-    parser.add_argument(
-        "--prefill", type=str, default=[], nargs="+", help="URLs for prefill servers"
-    )
-    parser.add_argument(
-        "--decode", type=str, default=[], nargs="+", help="URLs for decode servers"
-    )
-    parser.add_argument(
-        "--prefill-bootstrap-ports",
-        type=int,
-        nargs="+",
-        help="Bootstrap ports for prefill servers",
-    )
-    parser.add_argument(
-        "--host", default="0.0.0.0", help="Host to bind the server (default: 0.0.0.0)"
-    )
-    parser.add_argument(
-        "--port", type=int, default=8000, help="Port to bind the server (default: 8000)"
-    )
-    args = parser.parse_args()
-    bootstrap_ports = args.prefill_bootstrap_ports
-    if bootstrap_ports is None:
-        bootstrap_ports = [None] * len(args.prefill)
-    elif len(bootstrap_ports) == 1:
-        bootstrap_ports = bootstrap_ports * len(args.prefill)
-    else:
-        if len(bootstrap_ports) != len(args.prefill):
-            raise ValueError(
-                "Number of prefill URLs must match number of bootstrap ports"
-            )
-    prefill_configs = [
-        PrefillConfig(url, port) for url, port in zip(args.prefill, bootstrap_ports)
-    ]
+    # FIXME: remove this, use the unified entry point: sglang.srt.disaggregation.launch_lb
+    from sglang.srt.disaggregation.launch_lb import main
-    run(prefill_configs, args.decode, args.host, args.port)
+    main()

sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

sglang 0.4.6.post4py3-none-any.whl → 0.4.7py3-none-any.whl