sglang 0.2.5-py3-none-any.whl → 0.2.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. sglang/__init__.py +33 -26
  2. sglang/api.py +9 -1
  3. sglang/bench_latency.py +2 -2
  4. sglang/bench_serving.py +10 -1
  5. sglang/check_env.py +1 -1
  6. sglang/lang/backend/litellm.py +1 -1
  7. sglang/lang/backend/openai.py +1 -1
  8. sglang/lang/backend/runtime_endpoint.py +4 -4
  9. sglang/lang/interpreter.py +24 -9
  10. sglang/lang/ir.py +1 -1
  11. sglang/srt/constrained/__init__.py +15 -0
  12. sglang/srt/constrained/base_cache.py +15 -0
  13. sglang/srt/constrained/fsm_cache.py +36 -1
  14. sglang/srt/constrained/jump_forward.py +15 -0
  15. sglang/srt/conversation.py +26 -0
  16. sglang/srt/hf_transformers_utils.py +18 -1
  17. sglang/srt/layers/context_flashattention_nopad.py +15 -0
  18. sglang/srt/layers/extend_attention.py +15 -0
  19. sglang/srt/layers/fused_moe.py +15 -0
  20. sglang/srt/layers/linear.py +15 -0
  21. sglang/srt/layers/logits_processor.py +109 -72
  22. sglang/srt/layers/quantization/__init__.py +15 -0
  23. sglang/srt/layers/quantization/fp8.py +15 -0
  24. sglang/srt/layers/radix_attention.py +21 -3
  25. sglang/srt/layers/token_attention.py +16 -1
  26. sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
  27. sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
  28. sglang/srt/managers/detokenizer_manager.py +16 -1
  29. sglang/srt/managers/io_struct.py +38 -5
  30. sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
  31. sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +85 -25
  32. sglang/srt/managers/tokenizer_manager.py +99 -57
  33. sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +177 -81
  34. sglang/srt/mem_cache/flush_cache.py +33 -0
  35. sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
  36. sglang/srt/{managers/controller → mem_cache}/radix_cache.py +15 -0
  37. sglang/srt/mm_utils.py +15 -0
  38. sglang/srt/model_config.py +20 -0
  39. sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +42 -18
  40. sglang/srt/{managers/controller → model_executor}/model_runner.py +51 -16
  41. sglang/srt/model_loader/model_loader.py +15 -0
  42. sglang/srt/model_loader/utils.py +16 -1
  43. sglang/srt/models/chatglm.py +16 -1
  44. sglang/srt/models/commandr.py +16 -1
  45. sglang/srt/models/dbrx.py +16 -1
  46. sglang/srt/models/deepseek.py +16 -1
  47. sglang/srt/models/deepseek_v2.py +532 -0
  48. sglang/srt/models/gemma.py +16 -1
  49. sglang/srt/models/gemma2.py +16 -1
  50. sglang/srt/models/gpt_bigcode.py +16 -1
  51. sglang/srt/models/grok.py +16 -1
  52. sglang/srt/models/internlm2.py +16 -1
  53. sglang/srt/models/llama2.py +16 -1
  54. sglang/srt/models/llama_classification.py +19 -4
  55. sglang/srt/models/llava.py +17 -2
  56. sglang/srt/models/llavavid.py +17 -2
  57. sglang/srt/models/minicpm.py +16 -1
  58. sglang/srt/models/mistral.py +15 -0
  59. sglang/srt/models/mixtral.py +16 -1
  60. sglang/srt/models/mixtral_quant.py +16 -1
  61. sglang/srt/models/qwen.py +16 -1
  62. sglang/srt/models/qwen2.py +16 -1
  63. sglang/srt/models/qwen2_moe.py +16 -1
  64. sglang/srt/models/stablelm.py +16 -1
  65. sglang/srt/models/yivl.py +15 -0
  66. sglang/srt/openai_api/adapter.py +545 -160
  67. sglang/srt/openai_api/protocol.py +65 -1
  68. sglang/srt/sampling_params.py +20 -4
  69. sglang/srt/server.py +90 -37
  70. sglang/srt/server_args.py +76 -17
  71. sglang/srt/utils.py +15 -0
  72. sglang/test/test_programs.py +5 -1
  73. sglang/utils.py +22 -0
  74. sglang/version.py +1 -1
  75. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/METADATA +40 -12
  76. sglang-0.2.7.dist-info/RECORD +93 -0
  77. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/WHEEL +1 -1
  78. sglang/srt/flush_cache.py +0 -18
  79. sglang-0.2.5.dist-info/RECORD +0 -92
  80. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/LICENSE +0 -0
  81. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/protocol.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Pydantic models for OpenAI API protocol"""

  import time
@@ -45,6 +60,55 @@ class UsageInfo(BaseModel):
      completion_tokens: Optional[int] = 0


+ class FileRequest(BaseModel):
+     # https://platform.openai.com/docs/api-reference/files/create
+     file: bytes  # The File object (not file name) to be uploaded
+     purpose: str = (
+         "batch"  # The intended purpose of the uploaded file, default is "batch"
+     )
+
+
+ class FileResponse(BaseModel):
+     id: str
+     object: str = "file"
+     bytes: int
+     created_at: int
+     filename: str
+     purpose: str
+
+
+ class BatchRequest(BaseModel):
+     input_file_id: (
+         str  # The ID of an uploaded file that contains requests for the new batch
+     )
+     endpoint: str  # The endpoint to be used for all requests in the batch
+     completion_window: str  # The time frame within which the batch should be processed
+     metadata: Optional[dict] = None  # Optional custom metadata for the batch
+
+
+ class BatchResponse(BaseModel):
+     id: str
+     object: str = "batch"
+     endpoint: str
+     errors: Optional[dict] = None
+     input_file_id: str
+     completion_window: str
+     status: str = "validating"
+     output_file_id: Optional[str] = None
+     error_file_id: Optional[str] = None
+     created_at: int
+     in_progress_at: Optional[int] = None
+     expires_at: Optional[int] = None
+     finalizing_at: Optional[int] = None
+     completed_at: Optional[int] = None
+     failed_at: Optional[int] = None
+     expired_at: Optional[int] = None
+     cancelling_at: Optional[int] = None
+     cancelled_at: Optional[int] = None
+     request_counts: dict = {"total": 0, "completed": 0, "failed": 0}
+     metadata: Optional[dict] = None
+
+
  class CompletionRequest(BaseModel):
      # Ordered by official OpenAI API documentation
      # https://platform.openai.com/docs/api-reference/completions/create
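The new Pydantic models mirror OpenAI's file and batch objects. A minimal client-side sketch of how they fit together (the ids, timestamps, and filename below are illustrative values, not anything taken from the package):

import time

from sglang.srt.openai_api.protocol import BatchRequest, BatchResponse, FileResponse

# A FileResponse is what the server returns after an upload; ids are server-assigned.
uploaded = FileResponse(
    id="file-abc123",
    bytes=2048,
    created_at=int(time.time()),
    filename="batch_input.jsonl",
    purpose="batch",
)

# A BatchRequest points a batch at the uploaded file and a target endpoint.
batch_req = BatchRequest(
    input_file_id=uploaded.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

# The corresponding BatchResponse starts in the "validating" state.
batch = BatchResponse(
    id="batch-xyz789",
    endpoint=batch_req.endpoint,
    input_file_id=batch_req.input_file_id,
    completion_window=batch_req.completion_window,
    created_at=int(time.time()),
)
print(batch.status, batch.request_counts)  # validating {'total': 0, 'completed': 0, 'failed': 0}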
@@ -152,7 +216,7 @@ class ChatCompletionRequest(BaseModel):
      logit_bias: Optional[Dict[str, float]] = None
      logprobs: Optional[bool] = False
      top_logprobs: Optional[int] = None
-     max_tokens: Optional[int] = 16
+     max_tokens: Optional[int] = None
      n: Optional[int] = 1
      presence_penalty: Optional[float] = 0.0
      response_format: Optional[ResponseFormat] = None
sglang/srt/sampling_params.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Sampling parameters for text generation."""

  from typing import List, Optional, Union
@@ -65,10 +80,11 @@ class SamplingParams:
              raise ValueError(
                  "presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}."
              )
-         if self.max_new_tokens < 0:
-             raise ValueError(
-                 f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
-             )
+         if self.max_new_tokens is not None:
+             if self.max_new_tokens < 0:
+                 raise ValueError(
+                     f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
+                 )

      def normalize(self, tokenizer):
          # Process stop strings
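This pairs with the max_tokens default change above: an unset limit now reaches SamplingParams as None instead of tripping the "< 0" comparison. A tiny sketch, assuming the validation runs in SamplingParams.verify() (the enclosing method name is not visible in this hunk):

from sglang.srt.sampling_params import SamplingParams

params = SamplingParams(max_new_tokens=None)
params.verify()  # the new None guard skips the "< 0" check instead of raising a TypeError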
sglang/srt/server.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """
  The entry point of inference server.
  SRT = SGLang Runtime.
@@ -23,17 +38,17 @@ import psutil
  import requests
  import uvicorn
  import uvloop
- from fastapi import FastAPI, Request
+ from fastapi import FastAPI, File, Form, Request, UploadFile
  from fastapi.responses import JSONResponse, Response, StreamingResponse

  from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.srt.constrained import disable_cache
  from sglang.srt.hf_transformers_utils import get_tokenizer
- from sglang.srt.managers.controller.manager_multi import (
+ from sglang.srt.managers.controller_multi import (
      start_controller_process as start_controller_process_multi,
  )
- from sglang.srt.managers.controller.manager_single import launch_tp_servers
- from sglang.srt.managers.controller.manager_single import (
+ from sglang.srt.managers.controller_single import launch_tp_servers
+ from sglang.srt.managers.controller_single import (
      start_controller_process as start_controller_process_single,
  )
  from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
@@ -41,8 +56,13 @@ from sglang.srt.managers.io_struct import GenerateReqInput
  from sglang.srt.managers.tokenizer_manager import TokenizerManager
  from sglang.srt.openai_api.adapter import (
      load_chat_template_for_openai_api,
+     v1_batches,
      v1_chat_completions,
      v1_completions,
+     v1_files_create,
+     v1_retrieve_batch,
+     v1_retrieve_file,
+     v1_retrieve_file_content,
  )
  from sglang.srt.openai_api.protocol import ModelCard, ModelList
  from sglang.srt.server_args import PortArgs, ServerArgs
@@ -65,9 +85,6 @@ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
  app = FastAPI()
  tokenizer_manager = None

- # Put some args for easily access
- global_server_args_dict = {}
-

  @app.get("/health")
  async def health() -> Response:
@@ -140,6 +157,35 @@ async def openai_v1_chat_completions(raw_request: Request):
      return await v1_chat_completions(tokenizer_manager, raw_request)


+ @app.post("/v1/files")
+ async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("batch")):
+     return await v1_files_create(
+         file, purpose, tokenizer_manager.server_args.file_storage_pth
+     )
+
+
+ @app.post("/v1/batches")
+ async def openai_v1_batches(raw_request: Request):
+     return await v1_batches(tokenizer_manager, raw_request)
+
+
+ @app.get("/v1/batches/{batch_id}")
+ async def retrieve_batch(batch_id: str):
+     return await v1_retrieve_batch(batch_id)
+
+
+ @app.get("/v1/files/{file_id}")
+ async def retrieve_file(file_id: str):
+     # https://platform.openai.com/docs/api-reference/files/retrieve
+     return await v1_retrieve_file(file_id)
+
+
+ @app.get("/v1/files/{file_id}/content")
+ async def retrieve_file_content(file_id: str):
+     # https://platform.openai.com/docs/api-reference/files/retrieve-contents
+     return await v1_retrieve_file_content(file_id)
+
+
  @app.get("/v1/models")
  def available_models():
      """Show available models."""
@@ -150,14 +196,6 @@ def available_models():
      return ModelList(data=model_cards)


- def _set_global_server_args(server_args: ServerArgs):
-     global global_server_args_dict
-     global_server_args_dict = {
-         "disable_flashinfer": server_args.disable_flashinfer,
-         "attention_reduce_in_fp32": server_args.attention_reduce_in_fp32,
-     }
-
-
  def _set_torch_compile_config():
      # The following configurations are for torch compile optimizations
      import torch._dynamo.config
@@ -171,11 +209,46 @@ def _set_torch_compile_config():
      torch._dynamo.config.accumulated_cache_size_limit = 256


+ def set_envs_and_config(server_args: ServerArgs):
+     # Set global environments
+     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+     os.environ["NCCL_CUMEM_ENABLE"] = "0"
+     os.environ["NCCL_NVLS_ENABLE"] = "0"
+     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+     # Set ulimit
+     set_ulimit()
+
+     # Enable show time cost for debugging
+     if server_args.show_time_cost:
+         enable_show_time_cost()
+
+     # Disable disk cache
+     if server_args.disable_disk_cache:
+         disable_cache()
+
+     # Fix triton bugs
+     if server_args.tp_size * server_args.dp_size > 1:
+         # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
+         maybe_set_triton_cache_manager()
+
+     # Set torch compile config
+     if server_args.enable_torch_compile:
+         _set_torch_compile_config()
+
+     # Set global chat template
+     if server_args.chat_template:
+         # TODO: replace this with huggingface transformers template
+         load_chat_template_for_openai_api(server_args.chat_template)
+
+
  def launch_server(
      server_args: ServerArgs,
      model_overide_args: Optional[dict] = None,
      pipe_finish_writer: Optional[mp.connection.Connection] = None,
  ):
+     server_args.check_server_args()
+
      """Launch an HTTP server."""
      global tokenizer_manager
@@ -184,34 +257,16 @@
          format="%(message)s",
      )

-     # Set global environments
-     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-     os.environ["NCCL_CUMEM_ENABLE"] = "0"
-     os.environ["NCCL_NVLS_ENABLE"] = "0"
-     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
-     set_ulimit()
-     if server_args.show_time_cost:
-         enable_show_time_cost()
-     if server_args.disable_disk_cache:
-         disable_cache()
      if not server_args.disable_flashinfer:
          assert_pkg_version(
              "flashinfer",
-             "0.1.1",
+             "0.1.2",
              "Please uninstall the old version and "
              "reinstall the latest version by following the instructions "
              "at https://docs.flashinfer.ai/installation.html.",
          )
-     if server_args.tp_size * server_args.dp_size > 1:
-         # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
-         maybe_set_triton_cache_manager()
-     if server_args.chat_template:
-         # TODO: replace this with huggingface transformers template
-         load_chat_template_for_openai_api(server_args.chat_template)
-     if server_args.enable_torch_compile:
-         _set_torch_compile_config()

-     _set_global_server_args(server_args)
+     set_envs_and_config(server_args)

      # Allocate ports
      server_args.port, server_args.additional_ports = allocate_init_ports(
@@ -230,8 +285,6 @@
 
      # Handle multi-node tensor parallelism
      if server_args.nnodes > 1:
-         assert server_args.dp_size == 1, "Multi-node dp is not supported."
-
          if server_args.node_rank != 0:
              tp_size_local = server_args.tp_size // server_args.nnodes
              gpu_ids = [
sglang/srt/server_args.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """The arguments of the server."""

  import argparse
@@ -28,7 +43,8 @@ class ServerArgs:
      mem_fraction_static: Optional[float] = None
      max_prefill_tokens: Optional[int] = None
      max_running_requests: Optional[int] = None
-     schedule_heuristic: str = "lpm"
+     max_num_reqs: Optional[int] = None
+     schedule_policy: str = "lpm"
      schedule_conservativeness: float = 1.0

      # Other runtime options
@@ -44,20 +60,25 @@ class ServerArgs:

      # Other
      api_key: str = ""
+     file_storage_pth: str = "SGlang_storage"

      # Data parallelism
      dp_size: int = 1
      load_balance_method: str = "round_robin"

+     # Chunked Prefill
+     chunked_prefill_size: Optional[int] = None
+
      # Optimization/debug options
      disable_flashinfer: bool = False
+     disable_flashinfer_sampling: bool = False
      disable_radix_cache: bool = False
      disable_regex_jump_forward: bool = False
      disable_cuda_graph: bool = False
      disable_disk_cache: bool = False
      enable_torch_compile: bool = False
-     attention_reduce_in_fp32: bool = False
      enable_p2p_check: bool = False
+     attention_reduce_in_fp32: bool = False
      efficient_weight_load: bool = False

      # Distributed args
@@ -70,15 +91,15 @@ class ServerArgs:
              self.tokenizer_path = self.model_path
          if self.mem_fraction_static is None:
              if self.tp_size >= 16:
-                 self.mem_fraction_static = 0.80
+                 self.mem_fraction_static = 0.79
              elif self.tp_size >= 8:
-                 self.mem_fraction_static = 0.84
+                 self.mem_fraction_static = 0.83
              elif self.tp_size >= 4:
-                 self.mem_fraction_static = 0.86
+                 self.mem_fraction_static = 0.85
              elif self.tp_size >= 2:
-                 self.mem_fraction_static = 0.88
+                 self.mem_fraction_static = 0.87
              else:
-                 self.mem_fraction_static = 0.89
+                 self.mem_fraction_static = 0.88
          if isinstance(self.additional_ports, int):
              self.additional_ports = [self.additional_ports]
          elif self.additional_ports is None:
@@ -174,6 +195,7 @@ class ServerArgs:
              "gptq",
              "marlin",
              "gptq_marlin",
+             "awq_marlin",
              "squeezellm",
              "bitsandbytes",
          ],
@@ -204,11 +226,17 @@ class ServerArgs:
              help="The maximum number of running requests.",
          )
          parser.add_argument(
-             "--schedule-heuristic",
+             "--max-num-reqs",
+             type=int,
+             default=ServerArgs.max_num_reqs,
+             help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
+         )
+         parser.add_argument(
+             "--schedule-policy",
              type=str,
-             default=ServerArgs.schedule_heuristic,
+             default=ServerArgs.schedule_policy,
              choices=["lpm", "random", "fcfs", "dfs-weight"],
-             help="The scheduling heuristic.",
+             help="The scheduling policy of the requests.",
          )
          parser.add_argument(
              "--schedule-conservativeness",
@@ -262,6 +290,12 @@ class ServerArgs:
              default=ServerArgs.api_key,
              help="Set API key of the server.",
          )
+         parser.add_argument(
+             "--file-storage-pth",
+             type=str,
+             default=ServerArgs.file_storage_pth,
+             help="The path of the file storage in backend.",
+         )

          # Data parallelism
          parser.add_argument(
@@ -288,15 +322,28 @@ class ServerArgs:
              help="The nccl init address of multi-node server.",
          )
          parser.add_argument(
-             "--nnodes", type=int, default=1, help="The number of nodes."
+             "--nnodes", type=int, default=ServerArgs.nnodes, help="The number of nodes."
          )
          parser.add_argument("--node-rank", type=int, help="The node rank.")

+         # Chunked prefill
+         parser.add_argument(
+             "--chunked-prefill-size",
+             type=int,
+             default=ServerArgs.chunked_prefill_size,
+             help="The size of the chunked prefill.",
+         )
+
          # Optimization/debug options
          parser.add_argument(
              "--disable-flashinfer",
              action="store_true",
-             help="Disable flashinfer inference kernels.",
+             help="Disable flashinfer attention kernels.",
+         )
+         parser.add_argument(
+             "--disable-flashinfer-sampling",
+             action="store_true",
+             help="Disable flashinfer sampling kernels.",
          )
          parser.add_argument(
              "--disable-radix-cache",
@@ -324,15 +371,15 @@ class ServerArgs:
              help="Optimize the model with torch.compile, experimental feature.",
          )
          parser.add_argument(
-             "--attention-reduce-in-fp32",
+             "--enable-p2p-check",
              action="store_true",
-             help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
-             "This only affects Triton attention kernels",
+             help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
          )
          parser.add_argument(
-             "--enable-p2p-check",
+             "--attention-reduce-in-fp32",
              action="store_true",
-             help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
+             help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
+             "This only affects Triton attention kernels",
          )
          parser.add_argument(
              "--efficient-weight-load",
@@ -357,6 +404,18 @@ class ServerArgs:
              f"disable_disk_cache={self.disable_disk_cache}, "
          )

+     def check_server_args(self):
+         assert (
+             self.tp_size % self.nnodes == 0
+         ), "tp_size must be divisible by number of nodes"
+         assert not (
+             self.dp_size > 1 and self.node_rank is not None
+         ), "multi-node data parallel is not supported"
+
+         assert not (
+             self.chunked_prefill_size is not None and self.disable_radix_cache
+         ), "chunked prefill is not supported with radix cache disabled currently"
+

  @dataclasses.dataclass
  class PortArgs:
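check_server_args() is called at the top of launch_server (see the server.py hunk above), so invalid combinations now fail before any processes start. A quick sketch of one rejected configuration; the model path is a placeholder:

from sglang.srt.server_args import ServerArgs

# Chunked prefill currently requires the radix cache to stay enabled.
args = ServerArgs(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model
    chunked_prefill_size=4096,
    disable_radix_cache=True,
)
try:
    args.check_server_args()
except AssertionError as err:
    print(err)  # chunked prefill is not supported with radix cache disabled currently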
sglang/srt/utils.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Common utilities."""

  import base64
sglang/test/test_programs.py CHANGED
@@ -118,7 +118,11 @@ def test_decode_json_regex():
      s += "}"

      ret = decode_json.run()
-     js_obj = json.loads(ret["json_output"])
+     try:
+         js_obj = json.loads(ret["json_output"])
+     except json.decoder.JSONDecodeError:
+         print(ret["json_output"])
+         raise
      assert isinstance(js_obj["name"], str)
      assert isinstance(js_obj["population"], int)

sglang/utils.py CHANGED
@@ -1,6 +1,7 @@
  """Common utilities."""

  import base64
+ import importlib
  import json
  import logging
  import signal
@@ -261,3 +262,24 @@ def graceful_registry(sub_module_name):
          logger.info(f"{sub_module_name} recive sigterm")

      signal.signal(signal.SIGTERM, graceful_shutdown)
+
+
+ class LazyImport:
+     def __init__(self, module_name, class_name):
+         self.module_name = module_name
+         self.class_name = class_name
+         self._module = None
+
+     def _load(self):
+         if self._module is None:
+             module = importlib.import_module(self.module_name)
+             self._module = getattr(module, self.class_name)
+         return self._module
+
+     def __getattr__(self, name):
+         module = self._load()
+         return getattr(module, name)
+
+     def __call__(self, *args, **kwargs):
+         module = self._load()
+         return module(*args, **kwargs)
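LazyImport records a module path and attribute name but defers the actual import until the object is first touched or called, which keeps importing sglang cheap when optional backends are installed. An illustrative use; the module path and model name below are examples, not the registrations the package itself makes:

from sglang.utils import LazyImport

# Nothing is imported yet; only the names are stored.
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")

# The first call triggers importlib.import_module and resolves the class.
backend = OpenAI("gpt-4o-mini")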
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.5"
+ __version__ = "0.2.7"