sglang-0.5.4.tar.gz → sglang-0.5.4.post1.tar.gz
This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the package files exactly as they appear in that registry.
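As a minimal sketch of how a file-level comparison like the listing below can be reproduced, the snippet here downloads both sdists via the public PyPI JSON API and reports added, removed, and changed files using only the Python standard library. It is an illustration, not the tool that generated this page; the helper names (`fetch_sdist`, `file_map`) are hypothetical.

```python
"""Sketch: reproduce a file-level diff between two sglang sdists from PyPI."""
import io
import json
import tarfile
import urllib.request


def fetch_sdist(name: str, version: str) -> tarfile.TarFile:
    # Resolve the sdist URL for a release via the PyPI JSON API and open it in memory.
    meta = json.load(urllib.request.urlopen(f"https://pypi.org/pypi/{name}/{version}/json"))
    url = next(u["url"] for u in meta["urls"] if u["packagetype"] == "sdist")
    data = urllib.request.urlopen(url).read()
    return tarfile.open(fileobj=io.BytesIO(data), mode="r:gz")


def file_map(tf: tarfile.TarFile) -> dict:
    # Map member paths (minus the leading "<name>-<version>/" prefix) to their bytes.
    out = {}
    for member in tf.getmembers():
        if member.isfile() and "/" in member.name:
            path = member.name.split("/", 1)[1]
            out[path] = tf.extractfile(member).read()
    return out


old = file_map(fetch_sdist("sglang", "0.5.4"))
new = file_map(fetch_sdist("sglang", "0.5.4.post1"))

for path in sorted(old.keys() | new.keys()):
    a, b = old.get(path), new.get(path)
    if a is None:
        print(f"added    {path}")
    elif b is None:
        print(f"removed  {path}")
    elif a != b:
        print(f"changed  {path}")
```

Per-file line counts such as the `+56 -12` figures below would additionally require a line diff (for example `difflib.unified_diff`) over each changed text file.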
- {sglang-0.5.4/sglang.egg-info → sglang-0.5.4.post1}/PKG-INFO +7 -6
- {sglang-0.5.4 → sglang-0.5.4.post1}/README.md +4 -3
- {sglang-0.5.4 → sglang-0.5.4.post1}/pyproject.toml +3 -3
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/bench_serving.py +56 -12
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/launch_server.py +2 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/batch_invariant_ops/batch_invariant_ops.py +101 -4
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/compilation/backend.py +1 -1
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/model_config.py +5 -5
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/parallel_state.py +0 -7
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/engine.py +18 -15
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/grpc_server.py +0 -1
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/http_server.py +75 -94
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/environ.py +16 -2
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/eplb/expert_distribution.py +30 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/function_call_parser.py +2 -0
- sglang-0.5.4.post1/sglang/srt/function_call/minimax_m2.py +367 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/activation.py +6 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/flashattention_backend.py +12 -2
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/flashinfer_backend.py +10 -1
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/flashinfer_mla_backend.py +18 -10
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/trtllm_mla_backend.py +1 -13
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/utils.py +78 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/communicator.py +1 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/layernorm.py +19 -4
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/logits_processor.py +5 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/ep_moe/layer.py +79 -272
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
- sglang-0.5.4.post1/sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang-0.5.4.post1/sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/token_dispatcher/deepep.py +18 -14
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/topk.py +4 -4
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/utils.py +3 -4
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/__init__.py +3 -5
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/awq.py +0 -3
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/base_config.py +7 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/fp8.py +68 -63
- sglang-0.5.4.post1/sglang/srt/layers/quantization/gguf.py +566 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/mxfp4.py +30 -38
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/unquant.py +23 -45
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/w4afp8.py +38 -2
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/radix_attention.py +5 -2
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/rotary_embedding.py +13 -1
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/sampler.py +12 -1
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/io_struct.py +3 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/multi_tokenizer_mixin.py +17 -1
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/scheduler.py +21 -15
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/tokenizer_manager.py +11 -19
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/hicache_storage.py +7 -1
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/memory_pool.py +82 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/model_executor/forward_batch_info.py +44 -3
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/model_executor/model_runner.py +1 -149
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/deepseek_v2.py +147 -44
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/glm4_moe.py +322 -354
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/glm4_moe_nextn.py +4 -14
- sglang-0.5.4.post1/sglang/srt/models/glm4v_moe.py +233 -0
- sglang-0.5.4.post1/sglang/srt/models/minimax_m2.py +922 -0
- sglang-0.5.4.post1/sglang/srt/models/nvila.py +355 -0
- sglang-0.5.4.post1/sglang/srt/models/nvila_lite.py +184 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen2.py +22 -1
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen3.py +34 -4
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen3_moe.py +2 -4
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/base_processor.py +1 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/glm4v.py +1 -1
- sglang-0.5.4/sglang/srt/multimodal/processors/vila.py → sglang-0.5.4.post1/sglang/srt/multimodal/processors/nvila.py +32 -24
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/parser/reasoning_parser.py +28 -1
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/server_args.py +365 -186
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/single_batch_overlap.py +2 -7
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/utils/common.py +87 -42
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/utils/hf_transformers_utils.py +7 -3
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_deterministic.py +235 -12
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_deterministic_utils.py +2 -1
- sglang-0.5.4.post1/sglang/version.py +1 -0
- {sglang-0.5.4 → sglang-0.5.4.post1/sglang.egg-info}/PKG-INFO +7 -6
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang.egg-info/SOURCES.txt +7 -2
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang.egg-info/requires.txt +2 -2
- sglang-0.5.4/sglang/srt/layers/moe/moe_runner/deep_gemm.py +0 -304
- sglang-0.5.4/sglang/srt/models/glm4v_moe.py +0 -400
- sglang-0.5.4/sglang/srt/models/vila.py +0 -306
- sglang-0.5.4/sglang/version.py +0 -1
- {sglang-0.5.4 → sglang-0.5.4.post1}/LICENSE +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/setup.cfg +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/bench_one_batch.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/check_env.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/compile_deep_gemm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/eval/llama3_eval.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/eval/loogle_eval.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/global_config.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/lang/api.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/lang/chat_template.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/lang/compiler.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/lang/interpreter.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/lang/ir.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/profiler.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/_custom_ops.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/batch_invariant_ops/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/compilation/compilation_config.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/compilation/compilation_counter.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/compilation/compile.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/compilation/compiler_interface.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/compilation/cuda_piecewise_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/compilation/fix_functionalization.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/compilation/fx_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/compilation/inductor_pass.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/compilation/pass_manager.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/compilation/piecewise_context_manager.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/compilation/weak_ref_tensor_jit.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/chatglm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/dbrx.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/deepseek_ocr.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/deepseekvl2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/dots_ocr.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/dots_vlm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/falcon_h1.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/internvl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/janus_pro.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/kimi_vl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/kimi_vl_moonvit.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/longcat_flash.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/mamba_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/modelopt_config.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/nemotron_h.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/olmo3.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/points_v15_chat.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/qwen3_next.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/qwen3_omni.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/qwen3_vl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/step3_vl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/update_config.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/configs/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/connector/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/connector/base_connector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/connector/redis.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/connector/remote_instance.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/connector/s3.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/connector/serde/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/connector/serde/safe_serde.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/connector/serde/serde.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/connector/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/constants.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/constrained/llguidance_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/constrained/outlines_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/constrained/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/constrained/xgrammar_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/custom_op.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/debug_utils/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/debug_utils/dump_comparator.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/debug_utils/dump_loader.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/debug_utils/dumper.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/debug_utils/text_comparator.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/ascend/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/ascend/conn.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/ascend/transfer_engine.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/base/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/base/conn.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/common/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/common/conn.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/common/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/decode.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/decode_kvcache_offload_manager.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/decode_schedule_batch_mixin.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/fake/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/fake/conn.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/kv_events.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/mini_lb.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/mooncake/conn.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/mooncake/transfer_engine.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/nixl/conn.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/prefill.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/disaggregation/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/all_reduce_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/npu_communicator.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/pymscclpp.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/pynccl_allocator.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/quick_all_reduce.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/symm_mem.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/naive_distributed.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/elastic_ep/elastic_ep.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/EngineBase.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/context.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/harmony_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/http_server_engine.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/protocol.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/serving_base.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/serving_chat.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/serving_classify.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/serving_completions.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/serving_embedding.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/serving_rerank.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/serving_responses.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/serving_score.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/serving_tokenize.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/tool_server.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/usage_processor.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/openai/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/entrypoints/tool.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/eplb/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/eplb/eplb_algorithms/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/eplb/eplb_manager.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/eplb/eplb_simulator/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/eplb/eplb_simulator/reader.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/eplb/expert_location.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/eplb/expert_location_dispatch.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/eplb/expert_location_updater.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/base_format_detector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/core_types.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/deepseekv31_detector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/deepseekv3_detector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/ebnf_composer.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/glm4_moe_detector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/gpt_oss_detector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/json_array_parser.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/kimik2_detector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/llama32_detector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/mistral_detector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/pythonic_detector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/qwen25_detector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/qwen3_coder_detector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/step3_detector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/function_call/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/grpc/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/grpc/compile_proto.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/grpc/grpc_request_manager.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/grpc/health_servicer.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/grpc/scheduler_launcher.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/grpc/sglang_scheduler_pb2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/grpc/sglang_scheduler_pb2.pyi +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/amx_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/aiter_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/ascend_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/attention_registry.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/cutlass_mla_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/chunk.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/chunk_delta_h.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/chunk_o.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/cumsum.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/fused_recurrent.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/index.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/l2norm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/layernorm_gated.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/op.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/solve_tril.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/fla/wy_fast.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/hybrid_attn_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/hybrid_linear_attn_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/intel_amx_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/mamba/causal_conv1d.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/mamba/mamba.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/mamba/mamba2_metadata.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/mamba/ops/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/merge_state.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/npu_ops/mla_preprocess.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/nsa/dequant_k_cache.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/nsa/index_buf_accessor.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/nsa/nsa_indexer.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/nsa/quant_k_cache.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/nsa/tilelang_kernel.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/nsa/transform_index.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/nsa/triton_kernel.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/nsa/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/nsa_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/tbo_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/torch_flex_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/triton_ops/merge_state.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/trtllm_mha_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/vision.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/vision_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/wave_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/wave_ops/decode_attention.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/wave_ops/extend_attention.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/wave_ops/prefill_attention.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/attention/xpu_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/deep_gemm_wrapper/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/deep_gemm_wrapper/configurer.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/deep_gemm_wrapper/entrypoint.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/dp_attention.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/elementwise.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/flashinfer_comm_fusion.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/model_parallel.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/modelopt_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/cutlass_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/cutlass_moe_params.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/moe_runner/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/moe_runner/base.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/moe_runner/triton.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/rocm_moe_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/router.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/moe/token_dispatcher/mooncake.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/multimodal.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/parameter.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/awq_triton.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/blockwise_int8.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/fp8_kernel.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/fpgemm_fp8.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/gptq.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/int8_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/kv_cache.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/marlin_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/marlin_utils_fp8.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/moe_wna16.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/mxfp4_tensor.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/petit.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/petit_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/qoq.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/quark/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/quark/quark.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/quark/quark_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/quark/schemes/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/quark/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/rocm_mxfp4_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/w8a8_fp8.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/quantization/w8a8_int8.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/rocm_linear_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/sparse_pooler.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/backend/base_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/backend/chunked_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/backend/triton_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/eviction_policy.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/layers.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/lora_registry.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/mem_pool.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/triton_ops/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/lora/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/async_dynamic_batch_tokenizer.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/cache_controller.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/configure_logging.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/disagg_service.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/mm_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/multimodal_processor.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/overlap_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/schedule_batch.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/scheduler_input_blocker.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/scheduler_output_processor_mixin.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/scheduler_pp_mixin.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/scheduler_recv_skipper.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/scheduler_runtime_checker_mixin.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/scheduler_update_weights_mixin.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/template_manager.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/tokenizer_communicator_mixin.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/managers/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/allocator.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/allocator_ascend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/common.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/evict_policy.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/hiradix_cache.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/mamba_radix_cache.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/memory_pool_host.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/multimodal_cache.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/radix_cache_cpp.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/backend_factory.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/eic/eic_storage.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/eic/test_unit.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/hf3fs/hf3fs_usrbio_client.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/lmcache/unit_test.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/nixl/nixl_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/mem_cache/swa_radix_cache.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/metrics/startup_func_log_and_timer.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/metrics/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/model_executor/cpu_graph_runner.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/model_executor/npu_graph_runner.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/model_loader/loader.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/model_loader/remote_instance_weight_loader_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/model_loader/weight_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/apertus.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/arcee.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/bailing_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/bailing_moe_nextn.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/bert.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/clip.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/deepseek_janus_pro.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/deepseek_nextn.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/deepseek_ocr.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/deepseek_vl2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/dots_ocr.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/dots_vlm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/dots_vlm_vit.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/ernie4.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/ernie4_eagle.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/falcon_h1.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/gemma3_causal.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/gemma3_mm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/gemma3n_audio.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/gemma3n_causal.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/gemma3n_mm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/glm4.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/glm4v.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/gpt_oss.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/granite.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/granitemoe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/grok.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/hunyuan.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/idefics2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/interns1.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/internvl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/kimi_vl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/kimi_vl_moonvit.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/llama.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/llama4.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/llama_eagle.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/llama_eagle3.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/llava.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/longcat_flash.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/longcat_flash_nextn.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/mimo.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/mimo_mtp.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/minicpmo.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/minicpmv.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/mllama4.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/nemotron_h.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/nemotron_nas.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/opt.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/persimmon.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/phi.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/phi4mm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/phi4mm_audio.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/phi4mm_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/phimoe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/pixtral.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/points_v15_chat.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen2_5_vl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen2_audio.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen2_classification.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen2_eagle.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen2_rm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen2_vl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen3_classification.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen3_next.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen3_next_mtp.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen3_omni_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen3_vl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/qwen3_vl_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/registry.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/roberta.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/sarashina2_vision.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/siglip.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/solar.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/starcoder2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/step3_vl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/transformers.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/mm_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/clip.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/deepseek_ocr.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/dots_vlm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/gemma3.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/gemma3n.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/internvl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/janus_pro.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/kimi_vl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/llava.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/minicpm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/mlama.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/mllama4.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/phi4mm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/pixtral.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/qwen_audio.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/qwen_vl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/sarashina2_vision.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/multimodal/processors/step3_vl.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/operations.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/operations_strategy.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/parser/code_completion_parser.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/parser/conversation.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/parser/harmony_parser.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/parser/jinja_template_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/sampling/custom_logit_processor.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/server_args_config_parser.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/base_spec_worker.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/cpp_ngram/ngram.cpp +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/cpp_ngram/ngram.h +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/cpp_ngram/ngram_cache.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/cpp_ngram/param.h +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/cpp_ngram/queue.h +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/draft_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/eagle_info.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/eagle_info_v2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/eagle_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/eagle_worker.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/eagle_worker_v2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/ngram_info.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/ngram_worker.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/spec_info.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/spec_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/speculative/standalone_worker.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/tokenizer/tiktoken_tokenizer.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/tracing/trace.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/two_batch_overlap.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/utils/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/utils/aio_rwlock.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/utils/bench_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/utils/host_shared_memory.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/utils/offloader.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/utils/patch_torch.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/utils/poll_based_barrier.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/utils/profile_merger.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/utils/rpd_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/utils/slow_rank_detector.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/utils/torch_memory_saver_adapter.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/warmup.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/weight_sync/tensor_bucket.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/srt/weight_sync/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/attention/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/attention/test_flashattn_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/attention/test_trtllm_mla_backend.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/doc_patch.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/get_logits_ut.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/kit_matched_stop.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/longbench_v2/__init__.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/longbench_v2/test_longbench_v2_eval.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/longbench_v2/validate_longbench_v2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/longbench_v2/validate_longbench_v2_standalone.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/runners.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/send_one.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/simple_eval_longbench_v2.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/simple_eval_mmmu_vlm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_activation.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_block_fp8.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_custom_ops.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_cutlass_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_cutlass_w4a8_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_deepep_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_disaggregation_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_dynamic_grad_mode.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_fp4_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_marlin_moe.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_marlin_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_programs.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/test/test_utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang/utils.py +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.5.4 → sglang-0.5.4.post1}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.4
+Version: 0.5.4.post1
 Summary: SGLang is a fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -223,6 +223,7 @@ Requires-Dist: datasets
 Requires-Dist: einops
 Requires-Dist: fastapi
 Requires-Dist: flashinfer_python==0.4.1
+Requires-Dist: gguf
 Requires-Dist: hf_transfer
 Requires-Dist: huggingface_hub
 Requires-Dist: interegular
@@ -251,7 +252,7 @@ Requires-Dist: requests
 Requires-Dist: scipy
 Requires-Dist: sentencepiece
 Requires-Dist: setproctitle
-Requires-Dist: sgl-kernel==0.3.16.
+Requires-Dist: sgl-kernel==0.3.16.post4
 Requires-Dist: soundfile==0.13.1
 Requires-Dist: tiktoken
 Requires-Dist: timm==1.0.16
@@ -274,7 +275,6 @@ Requires-Dist: nvidia-modelopt; extra == "modelopt"
 Provides-Extra: test
 Requires-Dist: accelerate; extra == "test"
 Requires-Dist: expecttest; extra == "test"
-Requires-Dist: gguf; extra == "test"
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
@@ -320,7 +320,7 @@ Dynamic: license-file

 --------------------------------------------------------------------------------

-| [**Blog**](https://lmsys.org/blog/
+| [**Blog**](https://lmsys.org/blog/)
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
@@ -328,9 +328,10 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

 ## News
+- [2025/10] 🔥 AMD AI Dev Day 2025 SGLang ([slide](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/sglang_amd_ai_devday_2025.pdf)), PyTorch Conference 2025 SGLang ([slide](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/sglang_pytorch_2025.pdf)).
 - [2025/09] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part II): 3.8x Prefill, 4.8x Decode Throughput ([blog](https://lmsys.org/blog/2025-09-25-gb200-part-2/)).
-- [2025/09]
-- [2025/08]
+- [2025/09] SGLang Day 0 Support for DeepSeek-V3.2 with Sparse Attention ([blog](https://lmsys.org/blog/2025-09-29-deepseek-V32/)).
+- [2025/08] SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
 - [2025/08] SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/05] Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
@@ -12,7 +12,7 @@

 --------------------------------------------------------------------------------

-| [**Blog**](https://lmsys.org/blog/
+| [**Blog**](https://lmsys.org/blog/)
 | [**Documentation**](https://docs.sglang.ai/)
 | [**Join Slack**](https://slack.sglang.ai/)
 | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
@@ -20,9 +20,10 @@
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

 ## News
+- [2025/10] 🔥 AMD AI Dev Day 2025 SGLang ([slide](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/sglang_amd_ai_devday_2025.pdf)), PyTorch Conference 2025 SGLang ([slide](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/sglang_pytorch_2025.pdf)).
 - [2025/09] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part II): 3.8x Prefill, 4.8x Decode Throughput ([blog](https://lmsys.org/blog/2025-09-25-gb200-part-2/)).
-- [2025/09]
-- [2025/08]
+- [2025/09] SGLang Day 0 Support for DeepSeek-V3.2 with Sparse Attention ([blog](https://lmsys.org/blog/2025-09-29-deepseek-V32/)).
+- [2025/08] SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
 - [2025/08] SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/05] Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.5.4"
+version = "0.5.4.post1"
 description = "SGLang is a fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.10"
@@ -27,6 +27,7 @@ dependencies = [
     "einops",
     "fastapi",
     "flashinfer_python==0.4.1",
+    "gguf",
     "hf_transfer",
     "huggingface_hub",
     "interegular",
@@ -55,7 +56,7 @@ dependencies = [
     "scipy",
     "sentencepiece",
     "setproctitle",
-    "sgl-kernel==0.3.16.
+    "sgl-kernel==0.3.16.post4",
     "soundfile==0.13.1",
     "tiktoken",
     "timm==1.0.16",
@@ -80,7 +81,6 @@ modelopt = ["nvidia-modelopt"]
 test = [
     "accelerate",
     "expecttest",
-    "gguf",
     "jsonlines",
     "matplotlib",
     "pandas",
@@ -88,6 +88,7 @@ class RequestFuncOutput:
     latency: float = 0.0
     ttft: float = 0.0  # Time to first token
     itl: List[float] = field(default_factory=list)  # List of inter-token latencies
+    text_chunks: List[str] = field(default_factory=list)
     prompt_len: int = 0
     error: str = ""
     output_len: int = 0
@@ -258,6 +259,9 @@ async def async_request_openai_completions(

                         # Decoding phase
                         else:
+                            output.text_chunks.append(
+                                data["choices"][0]["text"]
+                            )
                             output.itl.append(timestamp - most_recent_timestamp)

                         most_recent_timestamp = timestamp
@@ -574,9 +578,8 @@ async def async_request_sglang_generate(
                             num_new_tokens = output_len - last_output_len
                             if num_new_tokens == 0:
                                 continue
-
-
-                            ) / num_new_tokens
+                            chunk_gap = timestamp - most_recent_timestamp
+                            adjust_itl = chunk_gap / num_new_tokens
                             output.itl.extend([adjust_itl] * num_new_tokens)

                             most_recent_timestamp = timestamp
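The streaming hunk above replaces the old multi-line inter-token-latency expression with an explicit `chunk_gap / num_new_tokens` adjustment, so one streamed chunk's time gap is charged evenly to every token it carried. A minimal, self-contained sketch of that arithmetic (the helper name `spread_itl` is illustrative, not part of the benchmark script):

    def spread_itl(chunk_gap: float, num_new_tokens: int) -> list[float]:
        # One streamed chunk arrived after `chunk_gap` seconds and carried
        # `num_new_tokens` tokens; each token gets an equal share of the gap.
        if num_new_tokens == 0:
            return []
        adjust_itl = chunk_gap / num_new_tokens
        return [adjust_itl] * num_new_tokens

    # Example: a 30 ms gap that delivered 3 tokens counts as three 10 ms ITLs.
    assert spread_itl(0.030, 3) == [0.010, 0.010, 0.010]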
@@ -764,6 +767,7 @@ def get_dataset(args, tokenizer, model_id=None):
             image_content=args.image_content,
             image_format=args.image_format,
             image_resolution=args.image_resolution,
+            backend=args.backend,
         )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
@@ -781,6 +785,7 @@ def get_dataset(args, tokenizer, model_id=None):
         input_requests = sample_mmmu_requests(
             num_requests=args.num_prompts,
             processor=processor,
+            backend=args.backend,
             fixed_output_len=args.random_output_len,
             random_sample=True,
         )
@@ -1009,6 +1014,7 @@ async def get_mooncake_request_over_time(
 def sample_mmmu_requests(
     num_requests: int,
     processor: AutoProcessor | AutoTokenizer,
+    backend: str,
     fixed_output_len: Optional[int] = None,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
@@ -1081,7 +1087,7 @@ def sample_mmmu_requests(
             text_prompt = f"Question: {question}\n\nAnswer: "
             output_len = fixed_output_len if fixed_output_len is not None else 256
             data_row = create_mm_data_row(
-                text_prompt, [image], [image_data], output_len, processor
+                text_prompt, [image], [image_data], output_len, processor, backend
             )
             filtered_dataset.append(data_row)

@@ -1316,13 +1322,19 @@ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     )


-def create_mm_data_row(
+def create_mm_data_row(
+    text_prompt, images: list, images_base64, output_len, processor, backend
+):
     try:
-
-
-
-
-
+        if type(processor).__name__ == "Phi4MMProcessor":
+            # <|endoftext10|> is the image token used in the phi-4-multimodal model.
+            content_items = text_prompt.replace("image 1", "|endoftext10|")
+        else:
+            content_items = [
+                {"type": "image", "image": {"url": image_base64}}
+                for image_base64 in images_base64
+            ]
+            content_items.append({"type": "text", "text": text_prompt})
         prompt_str = processor.apply_chat_template(
             [{"role": "user", "content": content_items}],
             add_generation_prompt=True,
@@ -1362,8 +1374,16 @@ def create_mm_data_row(text_prompt, images: list, images_base64, output_len, pro
     # Vision tokens = total tokens - text tokens
     vision_prompt_len = prompt_len - text_prompt_len

+    use_raw_prompt = backend in [
+        "sglang-oai",
+        "sglang-oai-chat",
+        "vllm",
+        "vllm-chat",
+        "lmdeploy",
+        "lmdeploy-chat",
+    ]
     return DatasetRow(
-        prompt=text_prompt,
+        prompt=text_prompt if use_raw_prompt else prompt_str,
         prompt_len=prompt_len,
         output_len=output_len,
         text_prompt_len=text_prompt_len,
@@ -1382,6 +1402,7 @@ def sample_image_requests(
     image_content: str,
     image_format: str,
     image_resolution: str,
+    backend: str,
 ) -> List[DatasetRow]:
     """Generate requests with images.

@@ -1447,6 +1468,7 @@ def sample_image_requests(
             list(images_base64),
             int(output_lens[i]),
             processor,
+            backend,
         )

         dataset.append(data_row)
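The `backend` argument threaded through `create_mm_data_row` and the dataset samplers above selects between the raw text prompt and the chat-template-rendered `prompt_str`, presumably because the OpenAI-style chat backends apply the chat template server-side. A small illustrative sketch of that choice, with a hypothetical `pick_prompt` helper standing in for the inline `use_raw_prompt` logic:

    # Hypothetical helper mirroring the `use_raw_prompt` selection added above.
    RAW_PROMPT_BACKENDS = {
        "sglang-oai",
        "sglang-oai-chat",
        "vllm",
        "vllm-chat",
        "lmdeploy",
        "lmdeploy-chat",
    }

    def pick_prompt(backend: str, text_prompt: str, prompt_str: str) -> str:
        # Raw prompt for OpenAI-compatible backends, templated string otherwise.
        return text_prompt if backend in RAW_PROMPT_BACKENDS else prompt_str

    print(pick_prompt("sglang-oai", "raw question", "<templated prompt>"))  # raw question
    print(pick_prompt("sglang", "raw question", "<templated prompt>"))      # <templated prompt>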
@@ -1607,6 +1629,7 @@ def calculate_metrics(
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
     backend: str,
+    accept_length: Optional[float] = None,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
     output_lens: List[int] = []
     retokenized_output_lens: List[int] = []
@@ -1618,6 +1641,14 @@ def calculate_metrics(
     tpots: List[float] = []
     ttfts: List[float] = []
     e2e_latencies: List[float] = []
+    retokenized_itls: List[float] = []
+
+    use_retokenized_itl = (
+        accept_length is not None
+        and accept_length > 0
+        and backend in ("sglang-oai", "sglang-oai-chat")
+    )
+
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_len
@@ -1631,7 +1662,17 @@ def calculate_metrics(
                 total_input_vision += input_requests[i].vision_prompt_len
             if output_len > 1:
                 tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
-
+            if use_retokenized_itl:
+                for k, itl in enumerate(outputs[i].itl):
+                    num_tokens = len(
+                        tokenizer.encode(
+                            outputs[i].text_chunks[k], add_special_tokens=False
+                        )
+                    )
+                    adjusted_itl = itl / num_tokens
+                    retokenized_itls.extend([adjusted_itl] * num_tokens)
+            else:
+                itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)

             e2e_latencies.append(outputs[i].latency)
@@ -1647,6 +1688,8 @@ def calculate_metrics(
             "on the benchmark arguments.",
             stacklevel=2,
         )
+
+    itls = retokenized_itls if use_retokenized_itl else itls
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
@@ -1910,6 +1953,7 @@ async def benchmark(
         dur_s=benchmark_duration,
         tokenizer=tokenizer,
         backend=backend,
+        accept_length=accept_length,
     )

     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
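With the new `accept_length` parameter, `calculate_metrics` re-tokenizes each streamed text chunk (for the sglang OpenAI-compatible backends) and splits that chunk's measured gap across its actual token count, which matters when speculative decoding emits several tokens per chunk. A minimal sketch of the adjustment, using a toy whitespace split in place of `tokenizer.encode`:

    def retokenize_itls(itls: list[float], text_chunks: list[str]) -> list[float]:
        # Each recorded ITL covers one streamed chunk; dividing by the chunk's
        # token count yields a per-token latency, repeated once per token.
        adjusted: list[float] = []
        for gap, chunk in zip(itls, text_chunks):
            num_tokens = max(len(chunk.split()), 1)  # toy stand-in for tokenizer.encode(...)
            adjusted.extend([gap / num_tokens] * num_tokens)
        return adjusted

    # A 40 ms gap that carried "two tokens" becomes two 20 ms per-token latencies.
    print(retokenize_itls([0.040], ["two tokens"]))  # [0.02, 0.02]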
@@ -12,10 +12,12 @@ if __name__ == "__main__":

    try:
        if server_args.grpc_mode:
+            # Handle gRPC server
            from sglang.srt.entrypoints.grpc_server import serve_grpc

            asyncio.run(serve_grpc(server_args))
        else:
+            # Handle HTTP server
            from sglang.srt.entrypoints.http_server import launch_server

            launch_server(server_args)
@@ -9,6 +9,22 @@ import torch
 import triton
 import triton.language as tl

+from sglang.srt.layers.deep_gemm_wrapper.configurer import ENABLE_JIT_DEEPGEMM
+from sglang.srt.utils.common import calc_diff, get_bool_env_var
+
+if ENABLE_JIT_DEEPGEMM:
+    import deep_gemm
+
+_ENABLE_MM_DEEPGEMM = get_bool_env_var(
+    "SGLANG_BATCH_INVARIANT_OPS_ENABLE_MM_DEEPGEMM", "1"
+)
+_ENABLE_MM_COMPARISON_TEST = get_bool_env_var(
+    "SGLANG_BATCH_INVARIANT_OPS_ENABLE_MM_COMPARISON_TEST"
+)
+
+if not _ENABLE_MM_DEEPGEMM:
+    print("Disable DeepGEMM in batch invariant ops. Performance may be suboptimal.")
+
 __all__ = [
     "set_batch_invariant_mode",
     "is_batch_invariant_mode_enabled",
@@ -140,7 +156,7 @@ def matmul_kernel_persistent(
     tl.store(c_ptrs, c, mask=c_mask)


-def
+def _matmul_persistent_triton(
     a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None = None
 ):
     # Check constraints.
@@ -217,6 +233,54 @@ def matmul_persistent(
     return c


+def _matmul_persistent_deepgemm(
+    a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None = None
+):
+    M, K = a.shape
+    K, N = b.shape
+    dtype = a.dtype
+    out = torch.empty((M, N), device=a.device, dtype=dtype)
+
+    deep_gemm.bf16_gemm_nn(a, b, out)
+
+    # TODO can this be put in DeepGEMM's `c`?
+    if bias is not None:
+        out += bias
+
+    return out
+
+
+def matmul_persistent(
+    a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None = None
+):
+    if (
+        _ENABLE_MM_DEEPGEMM
+        and ENABLE_JIT_DEEPGEMM
+        and (a.dtype == torch.bfloat16)
+        and (b.dtype == torch.bfloat16)
+        and a.is_contiguous()
+        and b.transpose(0, 1).is_contiguous()
+    ):
+        if _ENABLE_MM_COMPARISON_TEST:
+            out_triton = _matmul_persistent_triton(a=a, b=b, bias=bias)
+            out_deepgemm = _matmul_persistent_deepgemm(a=a, b=b, bias=bias)
+            diff = calc_diff(out_triton, out_deepgemm)
+            assert diff < 0.0001, f"{diff=} {out_triton=} {out_deepgemm=}"
+            # can be enabled for debugging
+            # print(
+            #     f"{diff=} "
+            #     f"{(out_triton - out_deepgemm).abs().mean()=} "
+            #     f"{(out_triton - out_deepgemm).abs().sum()=} "
+            #     f"{torch.sum(out_triton != out_deepgemm)=} "
+            # )
+            # print(f"{a=} {b=} {bias=} {out_triton=} {out_deepgemm=}")
+            return out_deepgemm
+
+        return _matmul_persistent_deepgemm(a=a, b=b, bias=bias)
+
+    return _matmul_persistent_triton(a=a, b=b, bias=bias)
+
+
 @triton.jit
 def _log_softmax_kernel(
     input_ptr,
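Per the new `matmul_persistent` wrapper above, bf16 GEMMs whose operands have the expected layouts are routed to DeepGEMM (gated by `SGLANG_BATCH_INVARIANT_OPS_ENABLE_MM_DEEPGEMM` and the JIT DeepGEMM flag), and everything else falls back to the Triton persistent kernel. A rough, self-contained sketch of just the dispatch predicate, with stubs in place of the actual kernels; `uses_deepgemm` is an illustrative name, not part of the module:

    import torch

    def uses_deepgemm(a: torch.Tensor, b: torch.Tensor, deepgemm_available: bool) -> bool:
        # Mirrors the routing condition above: bf16 operands, row-major a,
        # and a b whose transpose is contiguous (i.e. a column-major K x N view).
        return (
            deepgemm_available
            and a.dtype == torch.bfloat16
            and b.dtype == torch.bfloat16
            and a.is_contiguous()
            and b.transpose(0, 1).is_contiguous()
        )

    a = torch.randn(128, 256).to(torch.bfloat16)
    b = torch.randn(512, 256).to(torch.bfloat16).transpose(0, 1)  # K x N view over an N x K buffer
    print(uses_deepgemm(a, b, deepgemm_available=True))               # True: eligible for DeepGEMM
    print(uses_deepgemm(a, b.contiguous(), deepgemm_available=True))  # False: falls back to Triton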
@@ -495,16 +559,39 @@ def mean_batch_invariant(input, dim, keepdim=False, dtype: torch.dtype | None =
     return torch.sum(input, dim=dim, keepdim=keepdim, dtype=torch.float32) / n_elems


+def bmm_batch_invariant(a, b, *, out=None):
+    # Batched matrix multiply: (B, M, K) x (B, K, N) -> (B, M, N)
+    # Process each batch separately with our persistent kernel
+    if a.ndim == 3 and b.ndim == 3:
+        results = []
+        for i in range(a.shape[0]):
+            results.append(matmul_persistent(a[i], b[i]))
+        result = torch.stack(results, dim=0)
+
+        if out is not None:
+            out.copy_(result)
+            return out
+        return result
+    else:
+        raise ValueError(
+            f"bmm_batch_invariant expects 3D tensors, "
+            f"got shapes {a.shape} and {b.shape}"
+        )
+
+
 _batch_invariant_MODE = False
 _batch_invariant_LIB = None
+_original_torch_bmm = None


 def is_batch_invariant_mode_enabled():
     return _batch_invariant_MODE


-def enable_batch_invariant_mode(
-
+def enable_batch_invariant_mode(
+    enable_bmm: bool = True,
+):
+    global _batch_invariant_MODE, _batch_invariant_LIB, _original_torch_bmm
     if _batch_invariant_MODE:
         return

@@ -517,11 +604,21 @@ def enable_batch_invariant_mode():
     )
     _batch_invariant_LIB.impl("aten::mean.dim", mean_batch_invariant, "CUDA")

+    if enable_bmm:
+        _batch_invariant_LIB.impl("aten::bmm", bmm_batch_invariant, "CUDA")
+
+        # Also monkeypatch torch.bmm directly as a fallback
+        _original_torch_bmm = torch.bmm
+        torch.bmm = bmm_batch_invariant
+

 def disable_batch_invariant_mode():
-    global _batch_invariant_MODE, _batch_invariant_LIB
+    global _batch_invariant_MODE, _batch_invariant_LIB, _original_torch_bmm
     if _batch_invariant_LIB is not None:
         _batch_invariant_LIB._destroy()
+    if _original_torch_bmm is not None:
+        torch.bmm = _original_torch_bmm
+        _original_torch_bmm = None
     _batch_invariant_MODE = False
     _batch_invariant_LIB = None

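The bmm changes above register `bmm_batch_invariant` both as an `aten::bmm` override and as a direct `torch.bmm` monkeypatch, and `disable_batch_invariant_mode` now restores the original binding. A hedged usage sketch, assuming a CUDA build where these ops are importable from `sglang.srt.batch_invariant_ops.batch_invariant_ops`:

    import torch
    from sglang.srt.batch_invariant_ops.batch_invariant_ops import (
        enable_batch_invariant_mode,
        disable_batch_invariant_mode,
    )

    # While the mode is enabled, torch.bmm on CUDA tensors is served by the
    # batch-invariant per-slice persistent matmul; disabling restores torch.bmm.
    enable_batch_invariant_mode(enable_bmm=True)
    try:
        a = torch.randn(4, 64, 32, device="cuda", dtype=torch.bfloat16)
        b = torch.randn(4, 32, 16, device="cuda", dtype=torch.bfloat16)
        out = torch.bmm(a, b)  # routed through bmm_batch_invariant
        print(out.shape)  # torch.Size([4, 64, 16])
    finally:
        disable_batch_invariant_mode()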
@@ -392,7 +392,7 @@ class SGLangBackend:
         self.configure_post_pass()

         self.split_gm, self.piecewise_graphs = split_graph(
-            graph, ["sglang.unified_attention_with_output"]
+            graph, ["sglang.unified_attention_with_output", "sglang.inplace_all_reduce"]
         )

         from torch._dynamo.utils import lazy_format_graph_code
@@ -535,7 +535,7 @@ class ModelConfig:
         quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
         return quant_cfg

-    def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> dict:
+    def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]:
         """Parse ModelOpt quantization config and return the appropriate quant_method."""
         json_quant_configs = quant_config_dict["quantization"]
         quant_algo = json_quant_configs.get("quant_algo", None)
@@ -547,8 +547,7 @@ class ModelConfig:
         elif quant_algo and "FP8" in quant_algo:
             return {"quant_method": "modelopt_fp8"}
         else:
-
-            return {"quant_method": "modelopt_fp8"}
+            return None

     def _is_already_quantized(self) -> bool:
         """Check if the model is already quantized based on config files."""
@@ -806,7 +805,7 @@ def _get_and_verify_dtype(
 ) -> torch.dtype:
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
-    config_dtype = getattr(config, "
+    config_dtype = getattr(config, "dtype", None)
     if isinstance(config_dtype, str):
         config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
     if config_dtype is None:
@@ -915,12 +914,13 @@ multimodal_model_archs = [
     "InternVLChatModel",
     "InternS1ForConditionalGeneration",
     "Phi4MMForCausalLM",
-    "VILAForConditionalGeneration",
     "Step3VLForConditionalGeneration",
     "POINTSV15ChatModel",
     "DotsVLMForCausalLM",
     "DotsOCRForCausalLM",
     "Sarashina2VisionForCausalLM",
+    "NVILAForConditionalGeneration",
+    "NVILALiteForConditionalGeneration",
     "DeepseekOCRForCausalLM",
 ]

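The `_get_and_verify_dtype` hunk above now reads `config.dtype` with a `None` fallback, and unknown ModelOpt algorithms return `None` instead of silently defaulting to FP8. A small sketch of the dtype resolution step, with an illustrative mapping standing in for the module's `_STR_DTYPE_TO_TORCH_DTYPE` table and a hypothetical `_Cfg` config object:

    import torch

    # Illustrative mapping; the real module keeps its own table.
    _STR_DTYPE_TO_TORCH_DTYPE = {
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }

    def resolve_config_dtype(config) -> torch.dtype | None:
        # Mirrors the hunk above: prefer config.dtype, which may be a string or None.
        config_dtype = getattr(config, "dtype", None)
        if isinstance(config_dtype, str):
            config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
        return config_dtype

    class _Cfg:  # hypothetical stand-in for a HF model config
        dtype = "bfloat16"

    print(resolve_config_dtype(_Cfg()))  # torch.bfloat16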
@@ -340,17 +340,10 @@ class GroupCoordinator:
         self.qr_comm: Optional[QuickAllReduce] = None
         if use_custom_allreduce and self.world_size > 1:
             # Initialize a custom fast all-reduce implementation.
-            if torch_compile is not None and torch_compile:
-                # For piecewise CUDA graph, the requirement for custom allreduce is larger to
-                # avoid illegal cuda memory access.
-                ca_max_size = 256 * 1024 * 1024
-            else:
-                ca_max_size = 8 * 1024 * 1024
             try:
                 self.ca_comm = CustomAllreduce(
                     group=self.cpu_group,
                     device=self.device,
-                    max_size=ca_max_size,
                 )
             except Exception as e:
                 logger.warning(
@@ -101,7 +101,7 @@ class Engine(EngineBase):

     Note:
     1. The HTTP server, Engine, and TokenizerManager all run in the main process.
-    2. Inter-process communication
+    2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """

     def __init__(self, **kwargs):
@@ -109,6 +109,8 @@ class Engine(EngineBase):
         The arguments of this function is the same as `sglang/srt/server_args.py::ServerArgs`.
         Please refer to `ServerArgs` for the documentation.
         """
+
+        # Parse server_args
         if "server_args" in kwargs:
             # Directly load server_args
             server_args = kwargs["server_args"]
@@ -118,29 +120,28 @@ class Engine(EngineBase):
             # Do not print logs by default
             kwargs["log_level"] = "error"
             server_args = ServerArgs(**kwargs)
+        self.server_args = server_args
+        logger.info(f"{server_args=}")

         # Shutdown the subprocesses automatically when the program exits
         atexit.register(self.shutdown)

-        # Allocate ports for inter-process communications
-        self.port_args = PortArgs.init_new(server_args)
-        logger.info(f"{server_args=}")
-
         # Launch subprocesses
-        tokenizer_manager, template_manager, scheduler_info =
-            server_args=server_args
-            port_args=self.port_args,
+        tokenizer_manager, template_manager, scheduler_info, port_args = (
+            _launch_subprocesses(server_args=server_args)
         )
-        self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
         self.template_manager = template_manager
         self.scheduler_info = scheduler_info
+        self.port_args = port_args

+        # Initialize ZMQ sockets
         context = zmq.Context(2)
         self.send_to_rpc = get_zmq_socket(
             context, zmq.DEALER, self.port_args.rpc_ipc_name, True
         )

+        # Enable tracing
         if server_args.enable_trace:
             process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
             if server_args.disaggregation_mode == "null":
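After this refactor, `_launch_subprocesses` allocates the ports itself and hands `port_args` back to the Engine, which stores it before opening its ZMQ sockets. A toy sketch of the new four-value contract, with stub objects standing in for the real ones (the stub `_launch_subprocesses` and `PortArgs` here are illustrative only):

    from dataclasses import dataclass

    @dataclass
    class PortArgs:  # toy stand-in for sglang.srt.server_args.PortArgs
        rpc_ipc_name: str

    def _launch_subprocesses(server_args):  # hypothetical stub
        # The real function spawns scheduler/detokenizer processes and allocates ports.
        return "tokenizer_manager", "template_manager", {"max_req_input_len": 4096}, PortArgs("ipc:///tmp/rpc")

    tokenizer_manager, template_manager, scheduler_info, port_args = _launch_subprocesses(
        server_args=None
    )
    print(port_args.rpc_ipc_name)  # the Engine now uses this to open its ZMQ sockets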
@@ -672,15 +673,17 @@ def _set_envs_and_config(server_args: ServerArgs):
     os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
     if not server_args.enable_symm_mem:
         os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
-    os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "
+    os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "8"
     os.environ["CUDA_MODULE_LOADING"] = "AUTO"
-
+
     if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
+        # flashinfer uses this environment variable for various kernels from MoE to quant kernels
         os.environ["TRTLLM_ENABLE_PDL"] = "1"

     if os.environ.get("CUTE_DSL_LOG_LEVEL") is None:
         # Default to warning level, to avoid too many logs
         os.environ["CUTE_DSL_LOG_LEVEL"] = "30"
+
     if os.environ.get("CUTE_DSL_LOG_TO_CONSOLE") is None:
         # Need to set log to console, otherwise the log level won't take effect
         os.environ["CUTE_DSL_LOG_TO_CONSOLE"] = "1"
@@ -709,7 +712,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.16.
+            "0.3.16.post4",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
@@ -840,7 +843,7 @@ def _launch_subprocesses(

     if os.getenv("SGLANG_BLOCK_NONZERO_RANK_CHILDREN") == "0":
         # When using `Engine` as a Python API, we don't want to block here.
-        return None, None, None
+        return None, None, None, port_args

     launch_dummy_health_check_server(
         server_args.host, server_args.port, server_args.enable_metrics
@@ -851,7 +854,7 @@ def _launch_subprocesses(
             logger.error(
                 f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}"
             )
-            return None, None, None
+            return None, None, None, port_args

     # Launch detokenizer process
     detoken_proc = mp.Process(
@@ -897,4 +900,4 @@ def _launch_subprocesses(

     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]

-    return tokenizer_manager, template_manager, scheduler_info
+    return tokenizer_manager, template_manager, scheduler_info, port_args
@@ -999,7 +999,6 @@ def _wait_and_warmup_grpc(
     # Mark health service as SERVING after warmup completes
     if health_servicer:
         health_servicer.set_serving()
-        logger.info("Health service marked as SERVING")

     logger.info("The server is fired up and ready to roll!")
