sglang 0.2.15__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. sglang/bench_latency.py +10 -6
  2. sglang/bench_serving.py +33 -38
  3. sglang/global_config.py +0 -4
  4. sglang/lang/backend/runtime_endpoint.py +13 -6
  5. sglang/lang/interpreter.py +1 -1
  6. sglang/launch_server.py +3 -6
  7. sglang/launch_server_llavavid.py +7 -8
  8. sglang/srt/{model_config.py → configs/model_config.py} +5 -0
  9. sglang/srt/constrained/__init__.py +2 -0
  10. sglang/srt/constrained/fsm_cache.py +29 -38
  11. sglang/srt/constrained/jump_forward.py +0 -1
  12. sglang/srt/conversation.py +4 -1
  13. sglang/srt/hf_transformers_utils.py +2 -4
  14. sglang/srt/layers/attention_backend.py +480 -0
  15. sglang/srt/layers/flashinfer_utils.py +235 -0
  16. sglang/srt/layers/logits_processor.py +64 -77
  17. sglang/srt/layers/radix_attention.py +11 -161
  18. sglang/srt/layers/sampler.py +40 -35
  19. sglang/srt/layers/torchao_utils.py +75 -0
  20. sglang/srt/layers/{decode_attention.py → triton_attention/decode_attention.py} +67 -63
  21. sglang/srt/layers/{extend_attention.py → triton_attention/extend_attention.py} +40 -132
  22. sglang/srt/layers/{prefill_attention.py → triton_attention/prefill_attention.py} +13 -7
  23. sglang/srt/lora/lora.py +403 -0
  24. sglang/srt/lora/lora_config.py +43 -0
  25. sglang/srt/lora/lora_manager.py +256 -0
  26. sglang/srt/managers/controller_multi.py +1 -5
  27. sglang/srt/managers/controller_single.py +0 -5
  28. sglang/srt/managers/io_struct.py +16 -1
  29. sglang/srt/managers/policy_scheduler.py +122 -5
  30. sglang/srt/managers/schedule_batch.py +110 -74
  31. sglang/srt/managers/tokenizer_manager.py +24 -15
  32. sglang/srt/managers/tp_worker.py +181 -115
  33. sglang/srt/model_executor/cuda_graph_runner.py +60 -133
  34. sglang/srt/model_executor/forward_batch_info.py +35 -312
  35. sglang/srt/model_executor/model_runner.py +118 -141
  36. sglang/srt/models/baichuan.py +416 -0
  37. sglang/srt/models/chatglm.py +6 -8
  38. sglang/srt/models/commandr.py +1 -5
  39. sglang/srt/models/dbrx.py +1 -5
  40. sglang/srt/models/deepseek.py +1 -5
  41. sglang/srt/models/deepseek_v2.py +1 -5
  42. sglang/srt/models/exaone.py +8 -43
  43. sglang/srt/models/gemma.py +1 -5
  44. sglang/srt/models/gemma2.py +1 -5
  45. sglang/srt/models/gpt_bigcode.py +1 -5
  46. sglang/srt/models/grok.py +1 -5
  47. sglang/srt/models/internlm2.py +1 -5
  48. sglang/srt/models/{llama2.py → llama.py} +48 -26
  49. sglang/srt/models/llama_classification.py +14 -40
  50. sglang/srt/models/llama_embedding.py +7 -6
  51. sglang/srt/models/llava.py +38 -16
  52. sglang/srt/models/llavavid.py +7 -8
  53. sglang/srt/models/minicpm.py +1 -5
  54. sglang/srt/models/minicpm3.py +665 -0
  55. sglang/srt/models/mistral.py +2 -3
  56. sglang/srt/models/mixtral.py +6 -5
  57. sglang/srt/models/mixtral_quant.py +1 -5
  58. sglang/srt/models/qwen.py +1 -5
  59. sglang/srt/models/qwen2.py +1 -5
  60. sglang/srt/models/qwen2_moe.py +6 -5
  61. sglang/srt/models/stablelm.py +1 -5
  62. sglang/srt/models/xverse.py +375 -0
  63. sglang/srt/models/xverse_moe.py +445 -0
  64. sglang/srt/openai_api/adapter.py +65 -46
  65. sglang/srt/openai_api/protocol.py +11 -3
  66. sglang/srt/sampling/sampling_batch_info.py +67 -58
  67. sglang/srt/server.py +24 -14
  68. sglang/srt/server_args.py +130 -28
  69. sglang/srt/utils.py +12 -0
  70. sglang/test/few_shot_gsm8k.py +132 -0
  71. sglang/test/runners.py +114 -22
  72. sglang/test/test_programs.py +70 -0
  73. sglang/test/test_utils.py +89 -1
  74. sglang/utils.py +38 -4
  75. sglang/version.py +1 -1
  76. {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/METADATA +31 -18
  77. sglang-0.3.1.dist-info/RECORD +129 -0
  78. {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/WHEEL +1 -1
  79. sglang-0.2.15.dist-info/RECORD +0 -118
  80. {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/LICENSE +0 -0
  81. {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/protocol.py CHANGED
@@ -82,6 +82,14 @@ class StreamOptions(BaseModel):
     include_usage: Optional[bool] = False
 
 
+class JsonSchemaResponseFormat(BaseModel):
+    name: str
+    description: Optional[str] = None
+    # use alias to workaround pydantic conflict
+    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
+    strict: Optional[bool] = False
+
+
 class FileRequest(BaseModel):
     # https://platform.openai.com/docs/api-reference/files/create
     file: bytes  # The File object (not file name) to be uploaded
@@ -213,6 +221,7 @@ class ChatCompletionMessageContentImageURL(BaseModel):
 class ChatCompletionMessageContentImagePart(BaseModel):
     type: Literal["image_url"]
     image_url: ChatCompletionMessageContentImageURL
+    modalities: Optional[Literal["image", "multi-images", "video"]] = "image"
 
 
 ChatCompletionMessageContentPart = Union[
@@ -236,8 +245,8 @@ ChatCompletionMessageParam = Union[
 
 
 class ResponseFormat(BaseModel):
-    # type must be "json_object" or "text"
-    type: Literal["text", "json_object"]
+    type: Literal["text", "json_object", "json_schema"]
+    json_schema: Optional[JsonSchemaResponseFormat] = None
 
 
 class ChatCompletionRequest(BaseModel):
@@ -263,7 +272,6 @@ class ChatCompletionRequest(BaseModel):
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
-    json_schema: Optional[str] = None
     min_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
     stop_token_ids: Optional[List[int]] = Field(default_factory=list)
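The structured-output request format moves from the old SRT-only `json_schema` string to the OpenAI-style `response_format` with `type: "json_schema"`. A minimal sketch of a request body matching the new models above; the endpoint, port 30000, model name, and the `city_info` schema are assumptions for a default local deployment:

    import requests

    resp = requests.post(
        "http://localhost:30000/v1/chat/completions",
        json={
            "model": "default",
            "messages": [{"role": "user", "content": "Give me a JSON object describing Paris."}],
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "city_info",  # hypothetical schema name
                    # the "schema" key maps to JsonSchemaResponseFormat.schema_ via the alias
                    "schema": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "population": {"type": "integer"},
                        },
                        "required": ["name", "population"],
                    },
                },
            },
        },
    )
    print(resp.json()["choices"][0]["message"]["content"])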
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -34,70 +34,76 @@ class SamplingBatchInfo:
     linear_penalties: torch.Tensor = None
     scaling_penalties: torch.Tensor = None
 
-    def has_bias(self):
+    def __len__(self):
+        return len(self.temperatures)
+
+    def can_run_in_cuda_graph(self):
+        # Vocab bias and min_ps are not supported in CUDA graph
         return (
-            self.logit_bias is not None
-            or self.vocab_mask is not None
-            or self.linear_penalties is not None
-            or self.scaling_penalties is not None
+            self.logit_bias is None
+            and self.linear_penalties is None
+            and self.scaling_penalties is None
+            and not self.need_min_p_sampling
         )
 
     @classmethod
     def dummy_one(cls, max_bs: int, vocab_size: int):
         ret = cls(vocab_size=vocab_size)
-        ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
-        ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
-        ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
-        ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
+        with torch.device("cuda"):
+            ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float)
+            ret.top_ps = torch.ones((max_bs,), dtype=torch.float)
+            ret.top_ks = torch.ones((max_bs,), dtype=torch.int)
+            ret.vocab_mask = torch.zeros((max_bs, vocab_size), dtype=torch.bool)
         return ret
 
     def __getitem__(self, key):
         if isinstance(key, slice):
-            # NOTE: We do not use cuda graph when there is bias tensors
-            assert not self.has_bias()
+            # NOTE: This method is only used in CUDA graph
+            assert self.can_run_in_cuda_graph()
             return SamplingBatchInfo(
                 vocab_size=self.vocab_size,
                 temperatures=self.temperatures[key],
                 top_ps=self.top_ps[key],
                 top_ks=self.top_ks[key],
-                min_ps=self.min_ps[key],
-                need_min_p_sampling=self.need_min_p_sampling,
+                vocab_mask=self.vocab_mask[key],
             )
         else:
             raise NotImplementedError
 
     def inplace_assign(self, bs: int, other: SamplingBatchInfo):
-        # NOTE: We do not use cuda graph when there is bias tensors
-        assert not self.has_bias()
+        # NOTE: This method is only used in CUDA graph
+        assert self.can_run_in_cuda_graph()
 
         self.vocab_size = other.vocab_size
-        self.need_min_p_sampling = other.need_min_p_sampling
-
         self.temperatures[:bs] = other.temperatures
         self.top_ps[:bs] = other.top_ps
         self.top_ks[:bs] = other.top_ks
-        self.min_ps[:bs] = other.min_ps
+
+        if other.vocab_mask is None:
+            self.vocab_mask[:bs].fill_(False)
+        else:
+            self.vocab_mask[:bs] = other.vocab_mask
 
     @classmethod
     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
-        device = "cuda"
         reqs = batch.reqs
         ret = cls(vocab_size=vocab_size)
 
-        ret.temperatures = torch.tensor(
-            [r.sampling_params.temperature for r in reqs],
-            dtype=torch.float,
-            device=device,
-        ).view(-1, 1)
-        ret.top_ps = torch.tensor(
-            [r.sampling_params.top_p for r in reqs], dtype=torch.float, device=device
-        )
-        ret.top_ks = torch.tensor(
-            [r.sampling_params.top_k for r in reqs], dtype=torch.int, device=device
-        )
-        ret.min_ps = torch.tensor(
-            [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
-        )
+        with torch.device("cuda"):
+            ret.temperatures = torch.tensor(
+                [r.sampling_params.temperature for r in reqs],
+                dtype=torch.float,
+            ).view(-1, 1)
+            ret.top_ps = torch.tensor(
+                [r.sampling_params.top_p for r in reqs], dtype=torch.float
+            )
+            ret.top_ks = torch.tensor(
+                [r.sampling_params.top_k for r in reqs], dtype=torch.int
+            )
+            ret.min_ps = torch.tensor(
+                [r.sampling_params.min_p for r in reqs], dtype=torch.float
+            )
+
         ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)
 
         # Each penalizers will do nothing if they evaluate themselves as not required by looking at
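The rewritten constructors rely on `torch.device` being usable as a context manager (PyTorch 2.0+), which sets the default device for all factory calls inside the block instead of repeating `device=` on every call. A minimal sketch of the pattern, using CPU so it runs anywhere (the code above uses "cuda"):

    import torch

    with torch.device("cpu"):  # the diff uses torch.device("cuda")
        temperatures = torch.ones((8, 1), dtype=torch.float)
        top_ps = torch.ones((8,), dtype=torch.float)

    # Both tensors were created on the context's device without an explicit device= argument.
    print(temperatures.device, top_ps.device)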
@@ -110,7 +116,7 @@ class SamplingBatchInfo:
         ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
             vocab_size=vocab_size,
             batch=batch,
-            device=device,
+            device="cuda",
             Penalizers={
                 penaltylib.BatchedFrequencyPenalizer,
                 penaltylib.BatchedMinNewTokensPenalizer,
@@ -122,11 +128,9 @@ class SamplingBatchInfo:
         # Handle logit bias but only allocate when needed
         ret.logit_bias = None
 
-        ret.update_regex_vocab_mask(batch)
-
         return ret
 
-    def prepare_penalties(self):
+    def update_penalties(self):
         self.scaling_penalties = None
         self.linear_penalties = None
 
@@ -146,18 +150,16 @@ class SamplingBatchInfo:
                 self.linear_penalties = penalizer.apply(self.linear_penalties)
 
     def update_regex_vocab_mask(self, batch: ScheduleBatch):
-        bs, reqs = batch.batch_size(), batch.reqs
-        device = "cuda"
-        has_regex = any(req.regex_fsm is not None for req in reqs)
+        has_regex = any(req.regex_fsm is not None for req in batch.reqs)
 
         # Reset the vocab mask
         self.vocab_mask = None
 
         if has_regex:
             self.vocab_mask = torch.zeros(
-                bs, self.vocab_size, dtype=torch.bool, device=device
+                batch.batch_size(), self.vocab_size, dtype=torch.bool, device="cuda"
             )
-            for i, req in enumerate(reqs):
+            for i, req in enumerate(batch.reqs):
                 if req.regex_fsm is not None:
                     self.vocab_mask[i].fill_(1)
                     self.vocab_mask[i][
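In this masking scheme a row of `vocab_mask` is first filled with 1 (everything banned) and the tokens allowed by the regex FSM are then cleared, so `True` means "disallowed". A standalone sketch of how such a boolean mask is typically applied to logits before sampling (an illustration of the idea, not the library's sampler code; the allowed token ids are made up):

    import torch

    vocab_size = 10
    logits = torch.randn(vocab_size)

    # Ban everything, then re-allow the tokens the FSM permits (hypothetical ids).
    vocab_mask = torch.ones(vocab_size, dtype=torch.bool)
    vocab_mask[[2, 5, 7]] = False

    constrained = logits.masked_fill(vocab_mask, float("-inf"))
    print(constrained.argmax())  # always one of the allowed ids: 2, 5, or 7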
@@ -178,6 +180,26 @@ class SamplingBatchInfo:
             if self_val is not None:  # logit_bias can be None
                 setattr(self, item, self_val[new_indices])
 
+    @staticmethod
+    def merge_bias_tensor(
+        lhs: torch.Tensor, rhs: torch.Tensor, bs1: int, bs2: int, default: int = 0
+    ):
+        # bias tensor can be None
+        if lhs is not None or rhs is not None:
+            shape, dtype = None, None
+            if lhs is not None:
+                shape, dtype = lhs.shape[1:], lhs.dtype
+            else:
+                shape, dtype = rhs.shape[1:], rhs.dtype
+            with torch.dtype(dtype):
+                if lhs is None:
+                    lhs = torch.empty((bs1, *shape), device="cuda").fill_(default)
+                if rhs is None:
+                    rhs = torch.empty((bs2, *shape), device="cuda").fill_(default)
+            return torch.cat([lhs, rhs])
+
+        return None
+
     def merge(self, other: "SamplingBatchInfo"):
         self.penalizer_orchestrator.merge(other.penalizer_orchestrator)
 
@@ -191,19 +213,6 @@ class SamplingBatchInfo:
             other_val = getattr(other, item, None)
             setattr(self, item, torch.concat([self_val, other_val]))
 
-        # logit_bias can be None
-        if self.logit_bias is not None or other.logit_bias is not None:
-            vocab_size = (
-                self.logit_bias.shape[1]
-                if self.logit_bias is not None
-                else other.logit_bias.shape[1]
-            )
-            if self.logit_bias is None:
-                self.logit_bias = torch.zeros(
-                    (len(self.reqs), vocab_size), dtype=torch.float32, device="cuda"
-                )
-            if other.logit_bias is None:
-                other.logit_bias = torch.zeros(
-                    (len(other.reqs), vocab_size), dtype=torch.float32, device="cuda"
-                )
-            self.logit_bias = torch.concat([self.logit_bias, other.logit_bias])
+        self.logit_bias = SamplingBatchInfo.merge_bias_tensor(
+            self.logit_bias, other.logit_bias, len(self), len(other)
+        )
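The refactored `merge` delegates the None handling to `merge_bias_tensor`: if only one side of the merge carries a per-request bias tensor, the other side is padded with a default value before concatenation along the batch dimension. A standalone toy sketch of that behaviour (explicit `dtype=` arguments and CPU tensors are used here so it runs anywhere; `merge_bias_sketch` is not the library function):

    import torch

    def merge_bias_sketch(lhs, rhs, bs1, bs2, default=0):
        # Mirror of the merge logic: pad the missing side, then concatenate.
        if lhs is None and rhs is None:
            return None
        ref = lhs if lhs is not None else rhs
        shape, dtype = ref.shape[1:], ref.dtype
        if lhs is None:
            lhs = torch.full((bs1, *shape), default, dtype=dtype)
        if rhs is None:
            rhs = torch.full((bs2, *shape), default, dtype=dtype)
        return torch.cat([lhs, rhs])

    a = torch.tensor([[0.5, -1.0], [2.0, 0.0]])  # batch of 2 with a logit bias
    merged = merge_bias_sketch(a, None, bs1=2, bs2=3)
    print(merged.shape)  # torch.Size([5, 2]); rows 2-4 hold the default 0 bias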
sglang/srt/server.py CHANGED
@@ -37,6 +37,7 @@ import requests
 import uvicorn
 import uvloop
 from fastapi import FastAPI, File, Form, Request, UploadFile
+from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
@@ -93,6 +94,14 @@ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 app = FastAPI()
 tokenizer_manager = None
 
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
 
 @app.get("/health")
 async def health() -> Response:
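With the CORS middleware registered, browser clients on other origins can call the HTTP API directly. One quick way to inspect this is to send a preflight request from Python; a minimal sketch, assuming a server running on the default port 30000 and using http://example.com as an arbitrary origin:

    import requests

    r = requests.options(
        "http://localhost:30000/health",
        headers={
            "Origin": "http://example.com",
            "Access-Control-Request-Method": "GET",
        },
    )
    # Inspect the CORS headers granted by the middleware.
    print(r.status_code)
    print({k: v for k, v in r.headers.items() if k.lower().startswith("access-control")})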
@@ -272,7 +281,6 @@ async def retrieve_file_content(file_id: str):
 
 def launch_server(
     server_args: ServerArgs,
-    model_override_args: Optional[dict] = None,
     pipe_finish_writer: Optional[mp.connection.Connection] = None,
 ):
     """Launch an HTTP server."""
@@ -317,7 +325,6 @@
         tp_rank_range,
         server_args,
         ports[3],
-        model_override_args,
     )
 
     try:
@@ -328,23 +335,19 @@
         return
 
     # Launch processes
-    tokenizer_manager = TokenizerManager(server_args, port_args, model_override_args)
-    if server_args.chat_template:
-        load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
     pipe_controller_reader, pipe_controller_writer = mp.Pipe(duplex=False)
-    pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False)
 
     if server_args.dp_size == 1:
         start_controller_process = start_controller_process_single
     else:
         start_controller_process = start_controller_process_multi
-
     proc_controller = mp.Process(
         target=start_controller_process,
-        args=(server_args, port_args, pipe_controller_writer, model_override_args),
+        args=(server_args, port_args, pipe_controller_writer),
     )
     proc_controller.start()
 
+    pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False)
     proc_detoken = mp.Process(
         target=start_detokenizer_process,
         args=(
@@ -355,6 +358,10 @@
         )
     proc_detoken.start()
 
+    tokenizer_manager = TokenizerManager(server_args, port_args)
+    if server_args.chat_template:
+        load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
+
     # Wait for the model to finish loading
     controller_init_state = pipe_controller_reader.recv()
     detoken_init_state = pipe_detoken_reader.recv()
@@ -418,7 +425,7 @@ def _set_envs_and_config(server_args: ServerArgs):
         maybe_set_triton_cache_manager()
 
     # Check flashinfer version
-    if not server_args.disable_flashinfer:
+    if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer",
             "0.1.6",
@@ -440,13 +447,12 @@ def _wait_and_warmup(server_args, pipe_finish_writer, pid):
         time.sleep(1)
         try:
             res = requests.get(url + "/get_model_info", timeout=5, headers=headers)
-            assert res.status_code == 200, f"{res}"
+            assert res.status_code == 200, f"{res=}, {res.text=}"
             success = True
             break
-        except (AssertionError, requests.exceptions.RequestException) as e:
+        except (AssertionError, requests.exceptions.RequestException):
             last_traceback = get_exception_traceback()
             pass
-    model_info = res.json()
 
     if not success:
         if pipe_finish_writer is not None:
@@ -455,6 +461,8 @@ def _wait_and_warmup(server_args, pipe_finish_writer, pid):
         kill_child_process(pid, including_parent=False)
         return
 
+    model_info = res.json()
+
     # Send a warmup request
     request_name = "/generate" if model_info["is_generation"] else "/encode"
     max_new_tokens = 8 if model_info["is_generation"] else 1
@@ -501,7 +509,6 @@ class Runtime:
     def __init__(
        self,
        log_level: str = "error",
-       model_override_args: Optional[dict] = None,
        *args,
        **kwargs,
    ):
@@ -525,7 +532,7 @@
 
         proc = mp.Process(
             target=launch_server,
-            args=(self.server_args, model_override_args, pipe_writer),
+            args=(self.server_args, pipe_writer),
         )
         proc.start()
         pipe_writer.close()
@@ -604,6 +611,7 @@ class Runtime:
         return_logprob: Optional[Union[List[bool], bool]] = False,
         logprob_start_len: Optional[Union[List[int], int]] = None,
         top_logprobs_num: Optional[Union[List[int], int]] = None,
+        lora_path: Optional[List[Optional[str]]] = None,
     ):
         json_data = {
             "text": prompt,
@@ -611,7 +619,9 @@
             "return_logprob": return_logprob,
             "logprob_start_len": logprob_start_len,
             "top_logprobs_num": top_logprobs_num,
+            "lora_path": lora_path,
         }
+        assert not isinstance(lora_path, list) or len(lora_path) == len(prompt)
         response = requests.post(
             self.url + "/generate",
             json=json_data,
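Together with the new LoRA server options (see server_args.py below), `Runtime.generate` now forwards a per-prompt `lora_path`. A heavily hedged sketch of how this might be used; the model and adapter paths are placeholders, the pairing of batched prompts with per-request adapters only follows from the length check added above, and the disable flags reflect the 0.3.1 restriction that LoRA is not yet compatible with CUDA graph or the radix cache:

    import sglang as sgl

    # Hypothetical paths; replace with a real base model and LoRA adapter.
    runtime = sgl.Runtime(
        model_path="meta-llama/Llama-2-7b-hf",
        lora_paths=["/path/to/my-lora-adapter"],
        disable_cuda_graph=True,
        disable_radix_cache=True,
    )

    out = runtime.generate(
        prompt=["Hello from the base model.", "Hello through the adapter."],
        lora_path=[None, "/path/to/my-lora-adapter"],
    )
    print(out)
    runtime.shutdown()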
sglang/srt/server_args.py CHANGED
@@ -49,7 +49,6 @@ class ServerArgs:
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
-    max_num_reqs: Optional[int] = None
     max_total_tokens: Optional[int] = None
     chunked_prefill_size: int = 8192
     max_prefill_tokens: int = 16384
@@ -75,7 +74,18 @@ class ServerArgs:
     dp_size: int = 1
     load_balance_method: str = "round_robin"
 
+    # Distributed args
+    nccl_init_addr: Optional[str] = None
+    nnodes: int = 1
+    node_rank: Optional[int] = None
+
+    # Model override args in JSON
+    json_model_override_args: str = "{}"
+
     # Optimization/debug options
+    attention_backend: Optional[str] = None
+    sampling_backend: Optional[str] = None
+
     disable_flashinfer: bool = False
     disable_flashinfer_sampling: bool = False
     disable_radix_cache: bool = False
@@ -86,16 +96,17 @@ class ServerArgs:
     disable_custom_all_reduce: bool = False
     enable_mixed_chunk: bool = False
     enable_torch_compile: bool = False
+    torchao_config: str = ""
     enable_p2p_check: bool = False
     enable_mla: bool = False
     triton_attention_reduce_in_fp32: bool = False
 
-    # Distributed args
-    nccl_init_addr: Optional[str] = None
-    nnodes: int = 1
-    node_rank: Optional[int] = None
+    # LoRA
+    lora_paths: Optional[List[str]] = None
+    max_loras_per_batch: int = 8
 
     def __post_init__(self):
+        # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
 
@@ -106,6 +117,7 @@
             # Disable chunked prefill
             self.chunked_prefill_size = None
 
+        # Mem fraction depends on the tensor parallelism size
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
                 self.mem_fraction_static = 0.79
@@ -126,6 +138,42 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
+        # Deprecation warnings
+        if self.disable_flashinfer:
+            logger.warning(
+                "The option '--disable-flashinfer' will be deprecated in the next release. "
+                "Please use '--attention-backend triton' instead."
+            )
+            self.attention_backend = "triton"
+        if self.disable_flashinfer_sampling:
+            logger.warning(
+                "The option '--disable-flashinfer-sampling' will be deprecated in the next release. "
+                "Please use '--sampling-backend pytorch' instead. "
+            )
+            self.sampling_backend = "pytorch"
+
+        # Default kernel backends
+        if self.enable_mla:
+            logger.info("MLA optimization is tunred on. Use triton backend.")
+            self.attention_backend = "triton"
+
+        if self.attention_backend is None:
+            self.attention_backend = "flashinfer"
+
+        if self.sampling_backend is None:
+            self.sampling_backend = "flashinfer"
+
+        # Model-specific patches
+        if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path:
+            logger.info(
+                "Not sure why, the tokenizer will add an additional token at the end of the prompt when trust_remote_mode=True"
+            )
+            self.trust_remote_code = False
+
+        if "gemma-2" in self.model_path.lower():
+            logger.info("When using sliding window in gemma-2, turn on flashinfer.")
+            self.attention_backend = "flashinfer"
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         parser.add_argument(
@@ -209,11 +257,6 @@ class ServerArgs:
             action="store_true",
             help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
         )
-        parser.add_argument(
-            "--is-embedding",
-            action="store_true",
-            help="Whether to use a CausalLM as an embedding model.",
-        )
         parser.add_argument(
             "--context-length",
             type=int,
@@ -248,6 +291,11 @@ class ServerArgs:
             default=ServerArgs.chat_template,
             help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
         )
+        parser.add_argument(
+            "--is-embedding",
+            action="store_true",
+            help="Whether to use a CausalLM as an embedding model.",
+        )
         parser.add_argument(
             "--mem-fraction-static",
             type=float,
@@ -260,17 +308,12 @@ class ServerArgs:
             default=ServerArgs.max_running_requests,
             help="The maximum number of running requests.",
         )
-        parser.add_argument(
-            "--max-num-reqs",
-            type=int,
-            default=ServerArgs.max_num_reqs,
-            help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
-        )
         parser.add_argument(
             "--max-total-tokens",
             type=int,
             default=ServerArgs.max_total_tokens,
-            help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
+            help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. "
+            "This option is typically used for development and debugging purposes.",
         )
         parser.add_argument(
             "--chunked-prefill-size",
@@ -381,16 +424,38 @@ class ServerArgs:
         )
         parser.add_argument("--node-rank", type=int, help="The node rank.")
 
+        # Model override args
+        parser.add_argument(
+            "--json-model-override-args",
+            type=str,
+            help="A dictionary in JSON string format used to override default model configurations.",
+            default=ServerArgs.json_model_override_args,
+        )
+
         # Optimization/debug options
+        parser.add_argument(
+            "--attention-backend",
+            type=str,
+            choices=["flashinfer", "triton"],
+            default=ServerArgs.attention_backend,
+            help="Choose the kernels for attention layers.",
+        )
+        parser.add_argument(
+            "--sampling-backend",
+            type=str,
+            choices=["flashinfer", "pytorch"],
+            default=ServerArgs.sampling_backend,
+            help="Choose the kernels for sampling layers.",
+        )
         parser.add_argument(
             "--disable-flashinfer",
             action="store_true",
-            help="Disable flashinfer attention kernels.",
+            help="Disable flashinfer attention kernels. This option will be deprecated in the next release. Please use '--attention-backend triton' instead.",
         )
         parser.add_argument(
             "--disable-flashinfer-sampling",
             action="store_true",
-            help="Disable flashinfer sampling kernels.",
+            help="Disable flashinfer sampling kernels. This option will be deprecated in the next release. Please use '--sampling-backend pytorch' instead.",
         )
         parser.add_argument(
             "--disable-radix-cache",
@@ -431,7 +496,13 @@ class ServerArgs:
         parser.add_argument(
             "--enable-torch-compile",
             action="store_true",
-            help="Optimize the model with torch.compile, experimental feature.",
+            help="Optimize the model with torch.compile. Experimental feature.",
+        )
+        parser.add_argument(
+            "--torchao-config",
+            type=str,
+            default=ServerArgs.torchao_config,
+            help="Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo",
         )
         parser.add_argument(
             "--enable-p2p-check",
@@ -455,6 +526,21 @@ class ServerArgs:
             help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).",
         )
 
+        # LoRA options
+        parser.add_argument(
+            "--lora-paths",
+            type=str,
+            nargs="*",
+            default=None,
+            help="The list of LoRA adapters.",
+        )
+        parser.add_argument(
+            "--max-loras-per-batch",
+            type=int,
+            default=8,
+            help="Maximum number of adapters for a running batch, include base-only request",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
@@ -472,14 +558,30 @@ class ServerArgs:
         assert not (
             self.dp_size > 1 and self.node_rank is not None
         ), "multi-node data parallel is not supported"
-        if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path:
-            logger.info(
-                "Not sure why, the tokenizer will add an additional token at the end of the prompt when trust_remote_mode=True"
-            )
-            self.trust_remote_code = False
-        if "gemma-2" in self.model_path.lower():
-            logger.info("When using sliding window in gemma-2, turn on flashinfer.")
-            self.disable_flashinfer = False
+        assert (
+            self.max_loras_per_batch > 0
+            # FIXME
+            and (self.lora_paths is None or self.disable_cuda_graph)
+            and (self.lora_paths is None or self.disable_radix_cache)
+        ), "compatibility of lora and cuda graph and radix attention is in progress"
+
+
+def prepare_server_args(argv: List[str]) -> ServerArgs:
+    """
+    Prepare the server arguments from the command line arguments.
+
+    Args:
+        args: The command line arguments. Typically, it should be `sys.argv[1:]`
+            to ensure compatibility with `parse_args` when no arguments are passed.
+
+    Returns:
+        The server arguments.
+    """
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    raw_args = parser.parse_args(argv)
+    server_args = ServerArgs.from_cli_args(raw_args)
+    return server_args
 
 
 @dataclasses.dataclass
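The new module-level `prepare_server_args` helper builds a `ServerArgs` programmatically from CLI-style arguments, including the new backend, torchao, and model-override options. A minimal sketch, assuming sglang 0.3.1 is installed; the model path and the override JSON are placeholders:

    from sglang.srt.server_args import prepare_server_args

    server_args = prepare_server_args([
        "--model-path", "meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model
        "--attention-backend", "triton",
        "--sampling-backend", "pytorch",
        "--torchao-config", "int8wo",
        "--json-model-override-args", '{"max_position_embeddings": 8192}',  # hypothetical override
    ])
    print(server_args.attention_backend, server_args.sampling_backend, server_args.torchao_config)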
sglang/srt/utils.py CHANGED
@@ -35,6 +35,7 @@ import torch
 import torch.distributed as dist
 from fastapi.responses import JSONResponse
 from packaging import version as pkg_version
+from torch import nn
 from torch.nn.parameter import Parameter
 from triton.runtime.cache import (
     FileCacheManager,
@@ -714,3 +715,14 @@ def configure_logger(server_args, prefix: str = ""):
         datefmt="%H:%M:%S",
         force=True,
     )
+
+
+# source: https://github.com/vllm-project/vllm/blob/93b38bea5dd03e1b140ca997dfaadef86f8f1855/vllm/lora/utils.py#L9
+def replace_submodule(
+    model: nn.Module, module_name: str, new_module: nn.Module
+) -> nn.Module:
+    """Replace a submodule in a model with a new module."""
+    parent = model.get_submodule(".".join(module_name.split(".")[:-1]))
+    target_name = module_name.split(".")[-1]
+    setattr(parent, target_name, new_module)
+    return new_module
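`replace_submodule`, presumably used by the new LoRA code to swap plain layers for LoRA-wrapped ones, resolves a dotted module path and rebinds the attribute on its parent module. A small usage sketch with a toy model; `Block` and `Toy` are made up for illustration:

    from torch import nn
    from sglang.srt.utils import replace_submodule  # assuming sglang 0.3.1 is installed

    class Block(nn.Module):
        def __init__(self):
            super().__init__()
            self.proj = nn.Linear(4, 4)

    class Toy(nn.Module):
        def __init__(self):
            super().__init__()
            self.block = Block()

    model = Toy()
    # Swap block.proj for a bias-free linear layer in place.
    replace_submodule(model, "block.proj", nn.Linear(4, 4, bias=False))
    print(model.block.proj)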