sglang 0.4.1__py3-none-any.whl → 0.4.1.post2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- sglang/bench_offline_throughput.py +1 -0
- sglang/bench_serving.py +11 -3
- sglang/lang/backend/openai.py +10 -0
- sglang/srt/configs/model_config.py +11 -2
- sglang/srt/constrained/xgrammar_backend.py +6 -0
- sglang/srt/layers/attention/__init__.py +0 -1
- sglang/srt/layers/attention/flashinfer_backend.py +54 -41
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -14
- sglang/srt/layers/logits_processor.py +30 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -30
- sglang/srt/layers/moe/topk.py +14 -0
- sglang/srt/layers/quantization/fp8.py +42 -2
- sglang/srt/layers/quantization/fp8_kernel.py +91 -18
- sglang/srt/layers/quantization/fp8_utils.py +8 -2
- sglang/srt/managers/io_struct.py +29 -8
- sglang/srt/managers/schedule_batch.py +22 -15
- sglang/srt/managers/schedule_policy.py +1 -1
- sglang/srt/managers/scheduler.py +71 -34
- sglang/srt/managers/session_controller.py +102 -27
- sglang/srt/managers/tokenizer_manager.py +95 -55
- sglang/srt/managers/tp_worker.py +7 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -0
- sglang/srt/model_executor/forward_batch_info.py +42 -3
- sglang/srt/model_executor/model_runner.py +4 -6
- sglang/srt/model_loader/loader.py +22 -11
- sglang/srt/models/gemma2.py +19 -0
- sglang/srt/models/llama.py +13 -2
- sglang/srt/models/llama_eagle.py +132 -0
- sglang/srt/openai_api/adapter.py +79 -2
- sglang/srt/openai_api/protocol.py +50 -0
- sglang/srt/sampling/sampling_params.py +9 -2
- sglang/srt/server.py +45 -39
- sglang/srt/server_args.py +17 -30
- sglang/srt/speculative/spec_info.py +19 -0
- sglang/srt/utils.py +62 -0
- sglang/version.py +1 -1
- {sglang-0.4.1.dist-info → sglang-0.4.1.post2.dist-info}/METADATA +5 -5
- {sglang-0.4.1.dist-info → sglang-0.4.1.post2.dist-info}/RECORD +41 -39
- {sglang-0.4.1.dist-info → sglang-0.4.1.post2.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.dist-info → sglang-0.4.1.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.dist-info → sglang-0.4.1.post2.dist-info}/top_level.txt +0 -0
sglang/srt/models/llama_eagle.py
ADDED
@@ -0,0 +1,132 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+# Adapted from
+# https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py
+"""Inference-only LLaMA-EAGLE model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.llama import LlamaDecoderLayer, LlamaForCausalLM
+
+
+class LlamaDecoderLayer(LlamaDecoderLayer):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, layer_id, quant_config, prefix)
+
+        # Skip the input_layernorm
+        # https://github.com/SafeAILab/EAGLE/blob/35c78f6cdc19a73e05cf5c330b4c358dad970c6a/eagle/model/cnets.py#L427
+        if layer_id == 0:
+            del self.input_layernorm
+            setattr(self, "input_layernorm", lambda x: x)
+
+
+class LlamaModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                LlamaDecoderLayer(
+                    config, i, quant_config=quant_config, prefix=f"model.layers.{i}"
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.fc = torch.nn.Linear(config.hidden_size * 2, config.hidden_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        hidden_states = self.fc(
+            torch.cat((hidden_states, forward_batch.spec_info.hidden_states), dim=-1)
+        )
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        return hidden_states + residual
+
+
+class LlamaForCausalLMEagle(LlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config=None,
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.quant_config = quant_config
+        self.model = LlamaModel(config, quant_config=quant_config)
+        # Llama 3.2 1B Instruct set tie_word_embeddings to True
+        # Llama 3.1 8B Instruct set tie_word_embeddings to False
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size, config.hidden_size, quant_config=quant_config
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        for name, loaded_weight in weights:
+            if "lm_head" not in name:
+                name = "model." + name
+            super().load_weights([(name, loaded_weight)])
+
+
+EntryClass = [LlamaForCausalLMEagle]
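The heart of this new EAGLE draft model is the `fc` projection, which fuses the current token embedding with the hidden state carried over from the target model (`forward_batch.spec_info.hidden_states`) before the decoder layers run. A minimal standalone sketch of that fusion step, with hypothetical dimensions and random tensors standing in for the real `ForwardBatch` plumbing:

```python
import torch
from torch import nn

hidden_size = 2048          # hypothetical; comes from LlamaConfig in the real model
num_draft_tokens = 4        # hypothetical batch of draft positions

embed = torch.randn(num_draft_tokens, hidden_size)          # embed_tokens(input_ids)
target_hidden = torch.randn(num_draft_tokens, hidden_size)  # spec_info.hidden_states

# Mirrors self.fc = nn.Linear(config.hidden_size * 2, config.hidden_size) above:
# concatenate [embedding, target hidden state] and project back to hidden_size.
fc = nn.Linear(hidden_size * 2, hidden_size)
fused = fc(torch.cat((embed, target_hidden), dim=-1))
print(fused.shape)  # torch.Size([4, 2048])
```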
sglang/srt/openai_api/adapter.py
CHANGED
@@ -65,10 +65,13 @@ from sglang.srt.openai_api.protocol import (
     FileDeleteResponse,
     FileRequest,
     FileResponse,
+    FunctionResponse,
     LogProbs,
+    ToolCall,
     TopLogprob,
     UsageInfo,
 )
+from sglang.srt.utils import TOOLS_TAG_LIST, parse_tool_response
 from sglang.utils import get_exception_traceback

 logger = logging.getLogger(__name__)
@@ -517,6 +520,7 @@ def v1_generate_request(
         "repetition_penalty": request.repetition_penalty,
         "regex": request.regex,
         "json_schema": request.json_schema,
+        "ebnf": request.ebnf,
         "n": request.n,
         "no_stop_trim": request.no_stop_trim,
         "ignore_eos": request.ignore_eos,
@@ -692,6 +696,14 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):

 async def v1_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
+    if "extra_body" in request_json:
+        extra = request_json["extra_body"]
+        if "ebnf" in extra:
+            request_json["ebnf"] = extra["ebnf"]
+        if "regex" in extra:
+            request_json["regex"] = extra["regex"]
+        # remove extra_body to avoid pydantic conflict
+        del request_json["extra_body"]
     all_requests = [CompletionRequest(**request_json)]
     adapted_request, request = v1_generate_request(all_requests)

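The `extra_body` handling above lets an OpenAI-compatible client forward the SRT-only `ebnf` and `regex` fields without tripping pydantic validation on `CompletionRequest`. A hedged usage sketch, assuming a local server on port 30000 and the official `openai` Python client; the model name and grammar are illustrative:

```python
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

response = client.completions.create(
    model="default",
    prompt="Answer with yes or no: is the sky blue?",
    max_tokens=8,
    # v1_completions pops "ebnf" out of extra_body and passes it on as a sampling param.
    extra_body={"ebnf": 'root ::= "yes" | "no"'},
)
print(response.choices[0].text)
```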
@@ -870,6 +882,21 @@ def v1_chat_generate_request(
         # None skips any image processing in GenerateReqInput.
         if not isinstance(request.messages, str):
             # Apply chat template and its stop strings.
+            tools = None
+            if request.tools and request.tool_choice != "none":
+                request.skip_special_tokens = False
+                if request.stream:
+                    logger.warning("Streaming is not supported with tools.")
+                    request.stream = False
+                if not isinstance(request.tool_choice, str):
+                    tools = [
+                        item.function.model_dump()
+                        for item in request.tools
+                        if item.function.name == request.tool_choice.function.name
+                    ]
+                else:
+                    tools = [item.function.model_dump() for item in request.tools]
+
             if chat_template_name is None:
                 openai_compatible_messages = []
                 for message in request.messages:
@@ -893,6 +920,7 @@ def v1_chat_generate_request(
                     openai_compatible_messages,
                     tokenize=True,
                     add_generation_prompt=True,
+                    tools=tools,
                 )
                 if assistant_prefix:
                     prompt_ids += tokenizer_manager.tokenizer.encode(assistant_prefix)
@@ -936,6 +964,7 @@ def v1_chat_generate_request(
             "frequency_penalty": request.frequency_penalty,
             "repetition_penalty": request.repetition_penalty,
             "regex": request.regex,
+            "ebnf": request.ebnf,
             "n": request.n,
             "no_stop_trim": request.no_stop_trim,
             "ignore_eos": request.ignore_eos,
@@ -1031,11 +1060,46 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):

         finish_reason = ret_item["meta_info"]["finish_reason"]

+        tool_calls = None
+        text = ret_item["text"]
+
+        if isinstance(request, list):
+            tool_choice = request[idx].tool_choice
+            tools = request[idx].tools
+        else:
+            tool_choice = request.tool_choice
+            tools = request.tools
+
+        if tool_choice != "none" and any([i in text for i in TOOLS_TAG_LIST]):
+            if finish_reason == "stop":
+                finish_reason = "tool_calls"
+            try:
+                text, call_info_list = parse_tool_response(text, tools)  # noqa
+                tool_calls = [
+                    ToolCall(
+                        id=str(call_info[0]),
+                        function=FunctionResponse(
+                            name=call_info[1], arguments=call_info[2]
+                        ),
+                    )
+                    for call_info in call_info_list
+                ]
+            except Exception as e:
+                logger.error(f"Exception: {e}")
+                return create_error_response(
+                    HTTPStatus.BAD_REQUEST,
+                    "Failed to parse fc related info to json format!",
+                )
+
         if to_file:
             # to make the choice data json serializable
             choice_data = {
                 "index": 0,
-                "message": {
+                "message": {
+                    "role": "assistant",
+                    "content": ret_item["text"] if tool_calls is None else None,
+                    "tool_calls": tool_calls,
+                },
                 "logprobs": choice_logprobs,
                 "finish_reason": (finish_reason["type"] if finish_reason else ""),
                 "matched_stop": (
@@ -1047,7 +1111,11 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
         else:
             choice_data = ChatCompletionResponseChoice(
                 index=idx,
-                message=ChatMessage(
+                message=ChatMessage(
+                    role="assistant",
+                    content=ret_item["text"] if tool_calls is None else None,
+                    tool_calls=tool_calls,
+                ),
                 logprobs=choice_logprobs,
                 finish_reason=(finish_reason["type"] if finish_reason else ""),
                 matched_stop=(
@@ -1108,6 +1176,15 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):

 async def v1_chat_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
+    if "extra_body" in request_json:
+        extra = request_json["extra_body"]
+        # For example, if 'ebnf' is given:
+        if "ebnf" in extra:
+            request_json["ebnf"] = extra["ebnf"]
+        if "regex" in extra:
+            request_json["regex"] = extra["regex"]
+        # remove extra_body to avoid pydantic conflict
+        del request_json["extra_body"]
     all_requests = [ChatCompletionRequest(**request_json)]
     adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)

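Taken together, these changes should let a standard OpenAI-style function-calling request round-trip through /v1/chat/completions: tool schemas are injected into the chat template, and the generated text is parsed back into `tool_calls` via `parse_tool_response`. A hedged sketch with an illustrative tool schema and model name; note that the server forces `stream=False` when tools are present:

```python
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
)

message = response.choices[0].message
if message.tool_calls:
    call = message.tool_calls[0]
    print(call.function.name, call.function.arguments)
else:
    print(message.content)
```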
sglang/srt/openai_api/protocol.py
CHANGED
@@ -179,6 +179,7 @@ class CompletionRequest(BaseModel):
     ignore_eos: bool = False
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    ebnf: Optional[str] = None


 class CompletionResponseChoice(BaseModel):
@@ -256,6 +257,34 @@ class ResponseFormat(BaseModel):
     json_schema: Optional[JsonSchemaResponseFormat] = None


+class Function(BaseModel):
+    """Function descriptions."""
+
+    description: Optional[str] = Field(default=None, examples=[None])
+    name: str
+    parameters: Optional[object] = None
+
+
+class Tool(BaseModel):
+    """Function wrapper."""
+
+    type: str = Field(default="function", examples=["function"])
+    function: Function
+
+
+class ToolChoiceFuncName(BaseModel):
+    """The name of tool choice function."""
+
+    name: str
+
+
+class ToolChoice(BaseModel):
+    """The tool choice definition."""
+
+    function: ToolChoiceFuncName
+    type: Literal["function"] = Field(default="function", examples=["function"])
+
+
 class ChatCompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/chat/create
@@ -276,6 +305,10 @@ class ChatCompletionRequest(BaseModel):
     temperature: float = 0.7
     top_p: float = 1.0
     user: Optional[str] = None
+    tools: Optional[List[Tool]] = Field(default=None, examples=[None])
+    tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
+        default="auto", examples=["none"]
+    )  # noqa

     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
@@ -288,11 +321,28 @@ class ChatCompletionRequest(BaseModel):
     ignore_eos: bool = False
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    ebnf: Optional[str] = None
+
+
+class FunctionResponse(BaseModel):
+    """Function response."""
+
+    name: str
+    arguments: str
+
+
+class ToolCall(BaseModel):
+    """Tool call response."""
+
+    id: str
+    type: Literal["function"] = "function"
+    function: FunctionResponse


 class ChatMessage(BaseModel):
     role: Optional[str] = None
     content: Optional[str] = None
+    tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])


 class ChatCompletionResponseChoice(BaseModel):
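These pydantic models can also be exercised directly, which is a quick way to see how tool definitions validate before a request ever reaches the server. A sketch only: the field values are illustrative, and it assumes plain dicts coerce into the nested models the same way they do for incoming JSON:

```python
from sglang.srt.openai_api.protocol import ChatCompletionRequest

request = ChatCompletionRequest(
    model="default",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=[
        {
            "type": "function",
            "function": {"name": "get_weather", "parameters": {"type": "object"}},
        }
    ],
    # A dict here should coerce into the ToolChoice model defined above.
    tool_choice={"type": "function", "function": {"name": "get_weather"}},
)
print(request.tool_choice.function.name)  # get_weather
```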
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -36,6 +36,7 @@ class SamplingParams:
         regex: Optional[str] = None,
         n: int = 1,
         json_schema: Optional[str] = None,
+        ebnf: Optional[str] = None,
         no_stop_trim: bool = False,
         ignore_eos: bool = False,
         skip_special_tokens: bool = True,
@@ -60,6 +61,7 @@ class SamplingParams:
         self.regex = regex
         self.n = n
         self.json_schema = json_schema
+        self.ebnf = ebnf
         self.no_stop_trim = no_stop_trim

         # Process some special cases
@@ -111,8 +113,13 @@ class SamplingParams:
                 f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
                 f"{self.min_new_tokens}."
             )
-
-
+        grammars = [
+            self.json_schema,
+            self.regex,
+            self.ebnf,
+        ]  # since mutually exclusive, only one can be set
+        if sum(x is not None for x in grammars) > 1:
+            raise ValueError("Only one of regex, json_schema, or ebnf can be set.")

     def normalize(self, tokenizer):
         # Process stop strings
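The new guard treats `json_schema`, `regex`, and `ebnf` as mutually exclusive constrained-decoding grammars. The same logic in isolation, as a standalone sketch rather than the class itself:

```python
from typing import Optional


def check_grammar_exclusivity(
    json_schema: Optional[str] = None,
    regex: Optional[str] = None,
    ebnf: Optional[str] = None,
) -> None:
    # Mirrors the guard added above: at most one grammar may be set per request.
    grammars = [json_schema, regex, ebnf]
    if sum(x is not None for x in grammars) > 1:
        raise ValueError("Only one of regex, json_schema, or ebnf can be set.")


check_grammar_exclusivity(ebnf='root ::= "yes" | "no"')  # fine
# check_grammar_exclusivity(regex=r"yes|no", ebnf='root ::= "yes"')  # would raise
```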
sglang/srt/server.py
CHANGED
@@ -57,6 +57,7 @@ from sglang.srt.managers.io_struct import (
     OpenSessionReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromTensorReqInput,
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -109,6 +110,7 @@ app.add_middleware(
 tokenizer_manager: TokenizerManager = None
 scheduler_info: Dict = None

+
 ##### Native API endpoints #####


@@ -245,16 +247,11 @@ async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request):
     try:
         ret = await tokenizer_manager.get_weights_by_name(obj, request)
         if ret is None:
-            return
-                {"error": {"message": "Get parameter by name failed"}},
-                status_code=HTTPStatus.BAD_REQUEST,
-            )
+            return _create_error_response("Get parameter by name failed")
         else:
             return ORJSONResponse(ret, status_code=200)
     except Exception as e:
-        return
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
+        return _create_error_response(e)


 @app.api_route("/open_session", methods=["GET", "POST"])
@@ -262,11 +259,13 @@ async def open_session(obj: OpenSessionReqInput, request: Request):
     """Open a session, and return its unique session id."""
     try:
         session_id = await tokenizer_manager.open_session(obj, request)
+        if session_id is None:
+            raise Exception(
+                "Failed to open the session. Check if a session with the same id is still open."
+            )
         return session_id
     except Exception as e:
-        return
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
+        return _create_error_response(e)


 @app.api_route("/close_session", methods=["GET", "POST"])
@@ -276,9 +275,7 @@ async def close_session(obj: CloseSessionReqInput, request: Request):
         await tokenizer_manager.close_session(obj, request)
         return Response(status_code=200)
     except Exception as e:
-        return
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
+        return _create_error_response(e)


 # fastapi implicitly converts json in the request to obj (dataclass)
@@ -312,9 +309,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
         return ret
     except ValueError as e:
         logger.error(f"Error: {e}")
-        return
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
+        return _create_error_response(e)


 @app.api_route("/encode", methods=["POST", "PUT"])
@@ -325,9 +320,7 @@ async def encode_request(obj: EmbeddingReqInput, request: Request):
         ret = await tokenizer_manager.generate_request(obj, request).__anext__()
         return ret
     except ValueError as e:
-        return
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
+        return _create_error_response(e)


 @app.api_route("/classify", methods=["POST", "PUT"])
@@ -338,9 +331,7 @@ async def classify_request(obj: EmbeddingReqInput, request: Request):
         ret = await tokenizer_manager.generate_request(obj, request).__anext__()
         return ret
     except ValueError as e:
-        return
-            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
-        )
+        return _create_error_response(e)


 ##### OpenAI-compatible API endpoints #####
@@ -416,6 +407,12 @@ async def retrieve_file_content(file_id: str):
     return await v1_retrieve_file_content(file_id)


+def _create_error_response(e):
+    return ORJSONResponse(
+        {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+    )
+
+
 def launch_engine(
     server_args: ServerArgs,
 ):
@@ -493,7 +490,16 @@ def launch_engine(
     # Wait for model to finish loading
     scheduler_infos = []
     for i in range(len(scheduler_pipe_readers)):
-
+        try:
+            data = scheduler_pipe_readers[i].recv()
+        except EOFError as e:
+            logger.exception(e)
+            logger.error(
+                f"Rank {i} scheduler is dead. Please check if there are relevant logs."
+            )
+            scheduler_procs[i].join()
+            logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
+            raise

         if data["status"] != "ready":
             raise RuntimeError(
@@ -501,7 +507,7 @@ def launch_engine(
             )
         scheduler_infos.append(data)

-    # Assume all schedulers have same
+    # Assume all schedulers have same scheduler_info
     scheduler_info = scheduler_infos[0]


@@ -849,12 +855,10 @@ class Engine:
             group_name=group_name,
             backend=backend,
         )
-
-        async def _init_group():
-            return await tokenizer_manager.init_weights_update_group(obj, None)
-
         loop = asyncio.get_event_loop()
-        return loop.run_until_complete(
+        return loop.run_until_complete(
+            tokenizer_manager.init_weights_update_group(obj, None)
+        )

     def update_weights_from_distributed(self, name, dtype, shape):
         """Update weights from distributed source."""
@@ -863,22 +867,24 @@ class Engine:
             dtype=dtype,
             shape=shape,
         )
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            tokenizer_manager.update_weights_from_distributed(obj, None)
+        )

-
-
-
+    def update_weights_from_tensor(self, name, tensor):
+        """Update weights from distributed source."""
+        obj = UpdateWeightsFromTensorReqInput(name=name, tensor=tensor)
         loop = asyncio.get_event_loop()
-        return loop.run_until_complete(
+        return loop.run_until_complete(
+            tokenizer_manager.update_weights_from_tensor(obj, None)
+        )

     def get_weights_by_name(self, name, truncate_size=100):
         """Get weights by parameter name."""
         obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
-
-        async def _get_weights():
-            return await tokenizer_manager.get_weights_by_name(obj, None)
-
         loop = asyncio.get_event_loop()
-        return loop.run_until_complete(
+        return loop.run_until_complete(tokenizer_manager.get_weights_by_name(obj, None))


 class Runtime:
@@ -888,7 +894,7 @@ class Runtime:
     using the commond line interface.

     It is mainly used for the frontend language.
-    You should use the Engine class if you want to do normal offline processing.
+    You should use the Engine class above if you want to do normal offline processing.
     """

     def __init__(
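On the Engine side, the new `update_weights_from_tensor` follows the same synchronous wrapper pattern as the other weight-update helpers: build the request object, then drive the tokenizer manager's coroutine on the current event loop. A hedged usage sketch; the model path and parameter name are illustrative, and it assumes a GPU environment where the offline Engine can start:

```python
import torch
import sglang as sgl

engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")

# Read back a truncated copy of a parameter by name...
weights = engine.get_weights_by_name("model.embed_tokens.weight", truncate_size=5)

# ...and push an updated tensor into the running model. The tensor must match the
# real parameter's shape and dtype; the zeros tensor here is only a placeholder.
engine.update_weights_from_tensor("model.embed_tokens.weight", torch.zeros(1))
```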
sglang/srt/server_args.py
CHANGED
@@ -55,7 +55,7 @@ class ServerArgs:
     is_embedding: bool = False
     revision: Optional[str] = None

-    # Port
+    # Port for the HTTP server
     host: str = "127.0.0.1"
     port: int = 30000

@@ -68,6 +68,7 @@ class ServerArgs:
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
+    prefill_only_one_req: bool = False

     # Other runtime options
     tp_size: int = 1
@@ -94,6 +95,7 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+
     # Expert parallelism
     ep_size: int = 1

@@ -217,6 +219,13 @@ class ServerArgs:
             )
             self.disable_cuda_graph = True

+        # Expert parallelism
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            logger.info(
+                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )
+
         # Others
         if self.enable_dp_attention:
             self.dp_size = self.tp_size
@@ -229,12 +238,6 @@ class ServerArgs:
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
                 "Overlap scheduler is disabled."
             )
-        # Expert parallelism
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.info(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )

         # GGUF
         if (
@@ -430,13 +433,18 @@ class ServerArgs:
             default=ServerArgs.schedule_conservativeness,
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
-
         parser.add_argument(
             "--cpu-offload-gb",
             type=int,
             default=ServerArgs.cpu_offload_gb,
             help="How many GBs of RAM to reserve for CPU offloading",
         )
+        parser.add_argument(
+            "--prefill-only-one-req",
+            type=bool,
+            help="If true, we only prefill one request at one prefill batch",
+            default=ServerArgs.prefill_only_one_req,
+        )

         # Other runtime options
         parser.add_argument(
@@ -555,6 +563,7 @@ class ServerArgs:
                 "shortest_queue",
             ],
         )
+
         # Expert parallelism
         parser.add_argument(
             "--expert-parallel-size",
@@ -777,28 +786,6 @@ class ServerArgs:
             help="Delete the model checkpoint after loading the model.",
         )

-        # Deprecated arguments
-        parser.add_argument(
-            "--enable-overlap-schedule",
-            action=DeprecatedAction,
-            help="'--enable-overlap-schedule' is deprecated. It is enabled by default now. Please drop this argument.",
-        )
-        parser.add_argument(
-            "--disable-flashinfer",
-            action=DeprecatedAction,
-            help="'--disable-flashinfer' is deprecated. Please use '--attention-backend triton' instead.",
-        )
-        parser.add_argument(
-            "--disable-flashinfer-sampling",
-            action=DeprecatedAction,
-            help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytroch' instead.",
-        )
-        parser.add_argument(
-            "--disable-disk-cache",
-            action=DeprecatedAction,
-            help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
-        )
-
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size