sglang 0.3.6.post3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/bench_one_batch.py +4 -0
  2. sglang/bench_serving.py +13 -0
  3. sglang/check_env.py +1 -1
  4. sglang/srt/_custom_ops.py +118 -0
  5. sglang/srt/configs/device_config.py +17 -0
  6. sglang/srt/configs/load_config.py +84 -0
  7. sglang/srt/configs/model_config.py +161 -4
  8. sglang/srt/configs/qwen2vl.py +5 -8
  9. sglang/srt/constrained/outlines_backend.py +6 -1
  10. sglang/srt/constrained/outlines_jump_forward.py +8 -1
  11. sglang/srt/distributed/__init__.py +3 -0
  12. sglang/srt/distributed/communication_op.py +34 -0
  13. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  14. sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
  15. sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
  16. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
  17. sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
  18. sglang/srt/distributed/device_communicators/pynccl.py +204 -0
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
  20. sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
  21. sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
  22. sglang/srt/distributed/parallel_state.py +1275 -0
  23. sglang/srt/distributed/utils.py +223 -0
  24. sglang/srt/hf_transformers_utils.py +37 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +13 -15
  26. sglang/srt/layers/attention/torch_native_backend.py +285 -0
  27. sglang/srt/layers/fused_moe_patch.py +20 -11
  28. sglang/srt/layers/linear.py +1 -0
  29. sglang/srt/layers/logits_processor.py +17 -3
  30. sglang/srt/layers/quantization/__init__.py +34 -0
  31. sglang/srt/layers/vocab_parallel_embedding.py +1 -0
  32. sglang/srt/lora/lora.py +1 -1
  33. sglang/srt/managers/io_struct.py +48 -2
  34. sglang/srt/managers/schedule_batch.py +18 -14
  35. sglang/srt/managers/schedule_policy.py +7 -4
  36. sglang/srt/managers/scheduler.py +76 -20
  37. sglang/srt/managers/tokenizer_manager.py +166 -68
  38. sglang/srt/managers/tp_worker.py +36 -3
  39. sglang/srt/managers/tp_worker_overlap_thread.py +21 -3
  40. sglang/srt/model_executor/cuda_graph_runner.py +16 -7
  41. sglang/srt/model_executor/forward_batch_info.py +9 -4
  42. sglang/srt/model_executor/model_runner.py +136 -150
  43. sglang/srt/model_loader/__init__.py +34 -0
  44. sglang/srt/model_loader/loader.py +1139 -0
  45. sglang/srt/model_loader/utils.py +41 -0
  46. sglang/srt/model_loader/weight_utils.py +640 -0
  47. sglang/srt/models/baichuan.py +9 -10
  48. sglang/srt/models/chatglm.py +6 -15
  49. sglang/srt/models/commandr.py +2 -3
  50. sglang/srt/models/dbrx.py +2 -3
  51. sglang/srt/models/deepseek.py +4 -11
  52. sglang/srt/models/deepseek_v2.py +3 -11
  53. sglang/srt/models/exaone.py +2 -3
  54. sglang/srt/models/gemma.py +2 -6
  55. sglang/srt/models/gemma2.py +3 -14
  56. sglang/srt/models/gemma2_reward.py +0 -1
  57. sglang/srt/models/gpt2.py +5 -12
  58. sglang/srt/models/gpt_bigcode.py +6 -22
  59. sglang/srt/models/grok.py +3 -3
  60. sglang/srt/models/internlm2.py +2 -3
  61. sglang/srt/models/internlm2_reward.py +0 -1
  62. sglang/srt/models/llama.py +97 -27
  63. sglang/srt/models/llama_classification.py +1 -2
  64. sglang/srt/models/llama_embedding.py +1 -2
  65. sglang/srt/models/llama_reward.py +2 -3
  66. sglang/srt/models/llava.py +1 -4
  67. sglang/srt/models/llavavid.py +1 -2
  68. sglang/srt/models/minicpm.py +4 -7
  69. sglang/srt/models/minicpm3.py +6 -19
  70. sglang/srt/models/mixtral.py +12 -5
  71. sglang/srt/models/mixtral_quant.py +2 -3
  72. sglang/srt/models/mllama.py +3 -7
  73. sglang/srt/models/olmo.py +2 -8
  74. sglang/srt/models/olmo2.py +0 -1
  75. sglang/srt/models/olmoe.py +3 -5
  76. sglang/srt/models/phi3_small.py +8 -8
  77. sglang/srt/models/qwen.py +2 -3
  78. sglang/srt/models/qwen2.py +10 -9
  79. sglang/srt/models/qwen2_moe.py +4 -11
  80. sglang/srt/models/qwen2_vl.py +2 -6
  81. sglang/srt/models/registry.py +99 -0
  82. sglang/srt/models/stablelm.py +2 -3
  83. sglang/srt/models/torch_native_llama.py +6 -12
  84. sglang/srt/models/xverse.py +2 -4
  85. sglang/srt/models/xverse_moe.py +4 -11
  86. sglang/srt/models/yivl.py +2 -3
  87. sglang/srt/openai_api/adapter.py +9 -5
  88. sglang/srt/openai_api/protocol.py +1 -0
  89. sglang/srt/server.py +267 -170
  90. sglang/srt/server_args.py +65 -31
  91. sglang/srt/utils.py +245 -28
  92. sglang/test/test_utils.py +7 -0
  93. sglang/version.py +1 -1
  94. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.dist-info}/METADATA +1 -1
  95. sglang-0.4.0.dist-info/RECORD +184 -0
  96. sglang-0.3.6.post3.dist-info/RECORD +0 -162
  97. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.dist-info}/LICENSE +0 -0
  98. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.dist-info}/WHEEL +0 -0
  99. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.dist-info}/top_level.txt +0 -0
sglang/srt/server.py CHANGED
@@ -52,8 +52,11 @@ from sglang.srt.managers.io_struct import (
     CloseSessionReqInput,
     EmbeddingReqInput,
     GenerateReqInput,
+    GetWeightsByNameReqInput,
+    InitWeightsUpdateGroupReqInput,
     OpenSessionReqInput,
-    UpdateWeightReqInput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightsFromDistributedReqInput,
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -150,13 +153,11 @@ async def get_model_info():
 
 @app.get("/get_server_info")
 async def get_server_info():
-    try:
-        return await _get_server_info()
-
-    except Exception as e:
-        return ORJSONResponse(
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
+    return {
+        **dataclasses.asdict(tokenizer_manager.server_args),  # server args
+        **scheduler_info,
+        "version": __version__,
+    }
 
 
 @app.post("/flush_cache")
@@ -192,11 +193,11 @@ async def stop_profile_async():
     )
 
 
-@app.post("/update_weights")
+@app.post("/update_weights_from_disk")
 @time_func_latency
-async def update_weights(obj: UpdateWeightReqInput, request: Request):
-    """Update the weights inplace without re-launching the server."""
-    success, message = await tokenizer_manager.update_weights(obj, request)
+async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: Request):
+    """Update the weights from disk inplace without re-launching the server."""
+    success, message = await tokenizer_manager.update_weights_from_disk(obj, request)
     content = {"success": success, "message": message}
     if success:
         return ORJSONResponse(
@@ -210,6 +211,52 @@ async def update_weights(obj: UpdateWeightReqInput, request: Request):
         )
 
 
+@app.post("/init_weights_update_group")
+async def init_weights_update_group(
+    obj: InitWeightsUpdateGroupReqInput, request: Request
+):
+    """Initialize the parameter update group."""
+    success, message = await tokenizer_manager.init_weights_update_group(obj, request)
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/update_weights_from_distributed")
+async def update_weights_from_distributed(
+    obj: UpdateWeightsFromDistributedReqInput, request: Request
+):
+    """Update model parameter from distributed online."""
+    success, message = await tokenizer_manager.update_weights_from_distributed(
+        obj, request
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.api_route("/get_weights_by_name", methods=["GET", "POST"])
+async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request):
+    """Get model parameter by name."""
+    try:
+        ret = await tokenizer_manager.get_weights_by_name(obj, request)
+        if ret is None:
+            return ORJSONResponse(
+                {"error": {"message": "Get parameter by name failed"}},
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
+        else:
+            return ORJSONResponse(ret, status_code=200)
+    except Exception as e:
+        return ORJSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )
+
+
 @app.api_route("/open_session", methods=["GET", "POST"])
 async def open_session(obj: OpenSessionReqInput, request: Request):
     """Open a session, and return its unique session id."""
@@ -517,14 +564,6 @@ def launch_server(
         t.join()
 
 
-async def _get_server_info():
-    return {
-        **dataclasses.asdict(tokenizer_manager.server_args),  # server args
-        **scheduler_info,
-        "version": __version__,
-    }
-
-
 def _set_envs_and_config(server_args: ServerArgs):
     # Set global environments
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@@ -637,11 +676,218 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
         delete_directory(server_args.model_path)
 
 
+STREAM_END_SYMBOL = b"data: [DONE]"
+STREAM_CHUNK_START_SYMBOL = b"data:"
+
+
+class Engine:
+    """
+    SRT Engine without an HTTP server layer.
+
+    This class provides a direct inference engine without the need for an HTTP server. It is designed for use cases where
+    launching the HTTP server adds unnecessary complexity or overhead,
+    """
+
+    def __init__(self, log_level: str = "error", *args, **kwargs):
+        """See the arguments in server_args.py::ServerArgs"""
+
+        # before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
+        atexit.register(self.shutdown)
+
+        server_args = ServerArgs(*args, log_level=log_level, **kwargs)
+        launch_engine(server_args=server_args)
+
+    def generate(
+        self,
+        # The input prompt. It can be a single prompt or a batch of prompts.
+        prompt: Optional[Union[List[str], str]] = None,
+        sampling_params: Optional[Union[List[Dict], Dict]] = None,
+        # The token ids for text; one can either specify text or input_ids.
+        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
+        return_logprob: Optional[Union[List[bool], bool]] = False,
+        logprob_start_len: Optional[Union[List[int], int]] = None,
+        top_logprobs_num: Optional[Union[List[int], int]] = None,
+        lora_path: Optional[List[Optional[str]]] = None,
+        stream: bool = False,
+    ):
+        obj = GenerateReqInput(
+            text=prompt,
+            input_ids=input_ids,
+            sampling_params=sampling_params,
+            return_logprob=return_logprob,
+            logprob_start_len=logprob_start_len,
+            top_logprobs_num=top_logprobs_num,
+            lora_path=lora_path,
+            stream=stream,
+        )
+
+        # get the current event loop
+        loop = asyncio.get_event_loop()
+        ret = loop.run_until_complete(generate_request(obj, None))
+
+        if stream is True:
+
+            def generator_wrapper():
+                offset = 0
+                loop = asyncio.get_event_loop()
+                generator = ret.body_iterator
+                while True:
+                    chunk = loop.run_until_complete(generator.__anext__())
+
+                    if chunk.startswith(STREAM_END_SYMBOL):
+                        break
+                    else:
+                        data = json.loads(chunk[len(STREAM_CHUNK_START_SYMBOL) :])
+                        data["text"] = data["text"][offset:]
+                        offset += len(data["text"])
+                        yield data
+
+            # we cannot yield in the scope of generate() because python does not allow yield + return in the same function
+            # however, it allows to wrap the generator as a subfunction and return
+            return generator_wrapper()
+        else:
+            return ret
+
+    async def async_generate(
+        self,
+        # The input prompt. It can be a single prompt or a batch of prompts.
+        prompt: Optional[Union[List[str], str]] = None,
+        sampling_params: Optional[Dict] = None,
+        # The token ids for text; one can either specify text or input_ids.
+        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
+        return_logprob: Optional[Union[List[bool], bool]] = False,
+        logprob_start_len: Optional[Union[List[int], int]] = None,
+        top_logprobs_num: Optional[Union[List[int], int]] = None,
+        lora_path: Optional[List[Optional[str]]] = None,
+        stream: bool = False,
+    ):
+        obj = GenerateReqInput(
+            text=prompt,
+            input_ids=input_ids,
+            sampling_params=sampling_params,
+            return_logprob=return_logprob,
+            logprob_start_len=logprob_start_len,
+            top_logprobs_num=top_logprobs_num,
+            lora_path=lora_path,
+            stream=stream,
+        )
+
+        ret = await generate_request(obj, None)
+
+        if stream is True:
+            generator = ret.body_iterator
+
+            async def generator_wrapper():
+
+                offset = 0
+
+                while True:
+                    chunk = await generator.__anext__()
+
+                    if chunk.startswith(STREAM_END_SYMBOL):
+                        break
+                    else:
+                        data = json.loads(chunk[len(STREAM_CHUNK_START_SYMBOL) :])
+                        data["text"] = data["text"][offset:]
+                        offset += len(data["text"])
+                        yield data
+
+            return generator_wrapper()
+        else:
+            return ret
+
+    def shutdown(self):
+        kill_process_tree(os.getpid(), include_parent=False)
+
+    def get_tokenizer(self):
+        global tokenizer_manager
+
+        if tokenizer_manager is None:
+            raise ReferenceError("Tokenizer Manager is not initialized.")
+        else:
+            return tokenizer_manager.tokenizer
+
+    def encode(
+        self,
+        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+    ):
+        obj = EmbeddingReqInput(text=prompt)
+
+        # get the current event loop
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(encode_request(obj, None))
+
+    def start_profile(self):
+        tokenizer_manager.start_profile()
+
+    def stop_profile(self):
+        tokenizer_manager.stop_profile()
+
+    def get_server_info(self):
+        return {
+            **dataclasses.asdict(tokenizer_manager.server_args),  # server args
+            **scheduler_info,
+            "version": __version__,
+        }
+
+    def init_weights_update_group(
+        self,
+        master_address: str,
+        master_port: int,
+        rank_offset: int,
+        world_size: int,
+        group_name: str,
+        backend: str = "nccl",
+    ):
+        """Initialize parameter update group."""
+        obj = InitWeightsUpdateGroupReqInput(
+            master_address=master_address,
+            master_port=master_port,
+            rank_offset=rank_offset,
+            world_size=world_size,
+            group_name=group_name,
+            backend=backend,
+        )
+
+        async def _init_group():
+            return await tokenizer_manager.init_weights_update_group(obj, None)
+
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(_init_group())
+
+    def update_weights_from_distributed(self, name, dtype, shape):
+        """Update weights from distributed source."""
+        obj = UpdateWeightsFromDistributedReqInput(
+            name=name,
+            dtype=dtype,
+            shape=shape,
+        )
+
+        async def _update_weights():
+            return await tokenizer_manager.update_weights_from_distributed(obj, None)
+
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(_update_weights())
+
+    def get_weights_by_name(self, name, truncate_size=100):
+        """Get weights by parameter name."""
+        obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
+
+        async def _get_weights():
+            return await tokenizer_manager.get_weights_by_name(obj, None)
+
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(_get_weights())
+
+
 class Runtime:
     """
-    A wrapper for the server.
+    A wrapper for the HTTP server.
     This is used for launching the server in a python program without
     using the commond line interface.
+
+    It is mainly used for the frontend language.
+    You should use the Engine class if you want to do normal offline processing.
     """
 
     def __init__(
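Since this hunk introduces the Engine class ahead of Runtime, here is a short offline-usage sketch based on the constructor and generate() signatures shown above. Engine is imported from sglang.srt.server as defined in this file; the model path and sampling parameters are placeholders, not values taken from the diff:

    from sglang.srt.server import Engine

    # Engine forwards *args/**kwargs to ServerArgs, so any server argument works here.
    llm = Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")

    # Non-streaming generation over a batch of prompts.
    outputs = llm.generate(
        prompt=["The capital of France is", "1 + 1 ="],
        sampling_params={"temperature": 0.0, "max_new_tokens": 16},
    )
    print(outputs)

    # Streaming returns a generator that yields incremental text chunks.
    for chunk in llm.generate(prompt="Write a haiku about GPUs.", stream=True):
        print(chunk["text"], end="", flush=True)

    llm.shutdown()  # also registered via atexit in __init__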
@@ -789,152 +1035,3 @@ class Runtime:
 
     def __del__(self):
         self.shutdown()
-
-
-STREAM_END_SYMBOL = b"data: [DONE]"
-STREAM_CHUNK_START_SYMBOL = b"data:"
-
-
-class Engine:
-    """
-    SRT Engine without an HTTP server layer.
-
-    This class provides a direct inference engine without the need for an HTTP server. It is designed for use cases where
-    launching the HTTP server adds unnecessary complexity or overhead,
-    """
-
-    def __init__(self, log_level: str = "error", *args, **kwargs):
-        # before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
-        atexit.register(self.shutdown)
-
-        server_args = ServerArgs(*args, log_level=log_level, **kwargs)
-        launch_engine(server_args=server_args)
-
-    def generate(
-        self,
-        # The input prompt. It can be a single prompt or a batch of prompts.
-        prompt: Optional[Union[List[str], str]] = None,
-        sampling_params: Optional[Union[List[Dict], Dict]] = None,
-        # The token ids for text; one can either specify text or input_ids.
-        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
-        return_logprob: Optional[Union[List[bool], bool]] = False,
-        logprob_start_len: Optional[Union[List[int], int]] = None,
-        top_logprobs_num: Optional[Union[List[int], int]] = None,
-        lora_path: Optional[List[Optional[str]]] = None,
-        stream: bool = False,
-    ):
-        obj = GenerateReqInput(
-            text=prompt,
-            input_ids=input_ids,
-            sampling_params=sampling_params,
-            return_logprob=return_logprob,
-            logprob_start_len=logprob_start_len,
-            top_logprobs_num=top_logprobs_num,
-            lora_path=lora_path,
-            stream=stream,
-        )
-
-        # get the current event loop
-        loop = asyncio.get_event_loop()
-        ret = loop.run_until_complete(generate_request(obj, None))
-
-        if stream is True:
-
-            def generator_wrapper():
-                offset = 0
-                loop = asyncio.get_event_loop()
-                generator = ret.body_iterator
-                while True:
-                    chunk = loop.run_until_complete(generator.__anext__())
-
-                    if chunk.startswith(STREAM_END_SYMBOL):
-                        break
-                    else:
-                        data = json.loads(chunk[len(STREAM_CHUNK_START_SYMBOL) :])
-                        data["text"] = data["text"][offset:]
-                        offset += len(data["text"])
-                        yield data
-
-            # we cannot yield in the scope of generate() because python does not allow yield + return in the same function
-            # however, it allows to wrap the generator as a subfunction and return
-            return generator_wrapper()
-        else:
-            return ret
-
-    async def async_generate(
-        self,
-        # The input prompt. It can be a single prompt or a batch of prompts.
-        prompt: Optional[Union[List[str], str]] = None,
-        sampling_params: Optional[Dict] = None,
-        # The token ids for text; one can either specify text or input_ids.
-        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
-        return_logprob: Optional[Union[List[bool], bool]] = False,
-        logprob_start_len: Optional[Union[List[int], int]] = None,
-        top_logprobs_num: Optional[Union[List[int], int]] = None,
-        lora_path: Optional[List[Optional[str]]] = None,
-        stream: bool = False,
-    ):
-        obj = GenerateReqInput(
-            text=prompt,
-            input_ids=input_ids,
-            sampling_params=sampling_params,
-            return_logprob=return_logprob,
-            logprob_start_len=logprob_start_len,
-            top_logprobs_num=top_logprobs_num,
-            lora_path=lora_path,
-            stream=stream,
-        )
-
-        ret = await generate_request(obj, None)
-
-        if stream is True:
-            generator = ret.body_iterator
-
-            async def generator_wrapper():
-
-                offset = 0
-
-                while True:
-                    chunk = await generator.__anext__()
-
-                    if chunk.startswith(STREAM_END_SYMBOL):
-                        break
-                    else:
-                        data = json.loads(chunk[len(STREAM_CHUNK_START_SYMBOL) :])
-                        data["text"] = data["text"][offset:]
-                        offset += len(data["text"])
-                        yield data
-
-            return generator_wrapper()
-        else:
-            return ret
-
-    def shutdown(self):
-        kill_process_tree(os.getpid(), include_parent=False)
-
-    def get_tokenizer(self):
-        global tokenizer_manager
-
-        if tokenizer_manager is None:
-            raise ReferenceError("Tokenizer Manager is not initialized.")
-        else:
-            return tokenizer_manager.tokenizer
-
-    def encode(
-        self,
-        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
-    ):
-        obj = EmbeddingReqInput(text=prompt)
-
-        # get the current event loop
-        loop = asyncio.get_event_loop()
-        return loop.run_until_complete(encode_request(obj, None))
-
-    def start_profile(self):
-        tokenizer_manager.start_profile()
-
-    def stop_profile(self):
-        tokenizer_manager.stop_profile()
-
-    async def get_server_info(self):
-        return await _get_server_info()
sglang/srt/server_args.py CHANGED
@@ -20,6 +20,7 @@ import random
 import tempfile
 from typing import List, Optional
 
+from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.utils import (
     get_amdgpu_memory_capacity,
     get_nvgpu_memory_capacity,
@@ -49,6 +50,7 @@ class ServerArgs:
     served_model_name: Optional[str] = None
     chat_template: Optional[str] = None
     is_embedding: bool = False
+    revision: Optional[str] = None
 
     # Port
     host: str = "127.0.0.1"
@@ -58,7 +60,7 @@
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
     max_total_tokens: Optional[int] = None
-    chunked_prefill_size: int = 8192
+    chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
@@ -120,7 +122,7 @@
     disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
    disable_cuda_graph_padding: bool = False
-    disable_disk_cache: bool = False
+    disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     disable_mla: bool = False
     disable_overlap_schedule: bool = False
@@ -128,7 +130,7 @@
     enable_dp_attention: bool = False
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
-    cuda_graph_max_bs: int = 160
+    cuda_graph_max_bs: Optional[int] = None
     torchao_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
@@ -144,19 +146,20 @@
         if self.served_model_name is None:
             self.served_model_name = self.model_path
 
-        if self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
-            # Disable chunked prefill
-            self.chunked_prefill_size = None
-
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
-        # Mem fraction depends on the tensor parallelism size
+        if is_hip():
+            gpu_mem = get_amdgpu_memory_capacity()
+        else:
+            gpu_mem = get_nvgpu_memory_capacity()
+
+        # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
                 self.mem_fraction_static = 0.79
             elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.82
+                self.mem_fraction_static = 0.81
             elif self.tp_size >= 4:
                 self.mem_fraction_static = 0.85
             elif self.tp_size >= 2:
@@ -164,25 +167,35 @@
         else:
             self.mem_fraction_static = 0.88
 
-        # Adjust for GPUs with small memory capacities
-        if is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        else:
-            gpu_mem = get_nvgpu_memory_capacity()
-        if gpu_mem < 25000:
-            self.chunked_prefill_size //= 4  # make it 2048
-            self.cuda_graph_max_bs = 4
-            logger.info("Automatically adjust --chunked-prefill-size for small GPUs.")
+        # Set chunked prefill size, which depends on the gpu memory capacity
+        if self.chunked_prefill_size is None:
+            if gpu_mem < 25_000:
+                self.chunked_prefill_size = 2048
+            else:
+                self.chunked_prefill_size = 8192
 
-        # Choose kernel backends
-        if not is_flashinfer_available():
-            self.attention_backend = "triton"
-            self.sampling_backend = "pytorch"
+        # Set cuda graph max batch size
+        if self.cuda_graph_max_bs is None:
+            if gpu_mem < 25_000:
+                self.cuda_graph_max_bs = 8
+            else:
+                self.cuda_graph_max_bs = 160
 
+        # Choose kernel backends
         if self.attention_backend is None:
-            self.attention_backend = "flashinfer"
+            self.attention_backend = (
+                "flashinfer" if is_flashinfer_available() else "triton"
+            )
         if self.sampling_backend is None:
-            self.sampling_backend = "flashinfer"
+            self.sampling_backend = (
+                "flashinfer" if is_flashinfer_available() else "pytorch"
+            )
+
+        if self.attention_backend == "torch_native":
+            logger.warning(
+                "Cuda graph is disabled because of using torch native attention backend"
+            )
+            self.disable_cuda_graph = True
 
         # Others
         if self.enable_dp_attention:
@@ -191,14 +204,20 @@
             self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96)
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
             self.disable_overlap_schedule = True
-            logger.info(
+            logger.warning(
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
                 f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. "
                 f"The schedule conservativeness is adjusted to {self.schedule_conservativeness}. "
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
-                "Overlap schedule is disabled."
+                "Overlap scheduler is disabled."
             )
 
+        # GGUF
+        if (
+            self.load_format == "auto" or self.load_format == "gguf"
+        ) and check_gguf_file(self.model_path):
+            self.quantization = self.load_format = "gguf"
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and port args
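The GGUF block above means a .gguf checkpoint is picked up automatically: when load_format is "auto" or "gguf" and check_gguf_file() recognizes the file, both quantization and load_format are forced to "gguf". A hedged sketch of the effect (the file path is a placeholder, and constructing ServerArgs also runs the GPU-memory probing in __post_init__, so this assumes a GPU machine):

    from sglang.srt.server_args import ServerArgs

    # A local GGUF file; __post_init__ calls check_gguf_file() on model_path.
    args = ServerArgs(model_path="/models/llama-3.1-8b-q4_k_m.gguf")

    # Both fields collapse to "gguf" even though load_format defaulted to "auto".
    print(args.load_format, args.quantization)  # expected: gguf gguf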
@@ -238,7 +257,7 @@ class ServerArgs:
             "--load-format",
             type=str,
             default=ServerArgs.load_format,
-            choices=["auto", "pt", "safetensors", "npcache", "dummy"],
+            choices=["auto", "pt", "safetensors", "npcache", "dummy", "gguf"],
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             "and fall back to the pytorch bin format if safetensors format "
@@ -248,7 +267,8 @@ class ServerArgs:
             '"npcache" will load the weights in pytorch format and store '
             "a numpy cache to speed up the loading. "
             '"dummy" will initialize the weights with random values, '
-            "which is mainly for profiling.",
+            "which is mainly for profiling."
+            '"gguf" will load the weights in the gguf format. ',
         )
         parser.add_argument(
             "--trust-remote-code",
@@ -288,6 +308,7 @@
                 "gptq_marlin",
                 "awq_marlin",
                 "bitsandbytes",
+                "gguf",
             ],
             help="The quantization method.",
         )
@@ -321,6 +342,14 @@
             action="store_true",
             help="Whether to use a CausalLM as an embedding model.",
         )
+        parser.add_argument(
+            "--revision",
+            type=str,
+            default=None,
+            help="The specific model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
 
         # Memory and scheduling
         parser.add_argument(
@@ -572,7 +601,7 @@
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=["flashinfer", "triton"],
+            choices=["flashinfer", "triton", "torch_native"],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
@@ -613,9 +642,9 @@
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
         parser.add_argument(
-            "--disable-disk-cache",
+            "--disable-outlines-disk-cache",
             action="store_true",
-            help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
+            help="Disable disk cache of outlines to avoid possible crashes related to file system or high concurrency.",
         )
         parser.add_argument(
             "--disable-custom-all-reduce",
@@ -716,6 +745,11 @@ class ServerArgs:
             action=DeprecatedAction,
             help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytroch' instead.",
         )
+        parser.add_argument(
+            "--disable-disk-cache",
+            action=DeprecatedAction,
+            help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):