sglang 0.4.0__py3-none-any.whl → 0.4.0.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +1 -1
- sglang/bench_offline_throughput.py +18 -6
- sglang/bench_one_batch.py +13 -0
- sglang/bench_serving.py +8 -1
- sglang/check_env.py +140 -48
- sglang/lang/backend/runtime_endpoint.py +1 -0
- sglang/lang/chat_template.py +32 -0
- sglang/llama3_eval.py +316 -0
- sglang/srt/constrained/outlines_backend.py +5 -0
- sglang/srt/constrained/xgrammar_backend.py +9 -6
- sglang/srt/layers/attention/__init__.py +5 -2
- sglang/srt/layers/attention/double_sparsity_backend.py +22 -8
- sglang/srt/layers/attention/flashinfer_backend.py +22 -5
- sglang/srt/layers/attention/torch_native_backend.py +22 -8
- sglang/srt/layers/attention/triton_backend.py +38 -33
- sglang/srt/layers/attention/triton_ops/decode_attention.py +305 -350
- sglang/srt/layers/attention/triton_ops/extend_attention.py +3 -0
- sglang/srt/layers/ep_moe/__init__.py +0 -0
- sglang/srt/layers/ep_moe/kernels.py +349 -0
- sglang/srt/layers/ep_moe/layer.py +665 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +64 -21
- sglang/srt/layers/fused_moe_triton/layer.py +1 -1
- sglang/srt/layers/logits_processor.py +133 -95
- sglang/srt/layers/quantization/__init__.py +2 -47
- sglang/srt/layers/quantization/fp8.py +607 -0
- sglang/srt/layers/quantization/fp8_utils.py +27 -0
- sglang/srt/layers/radix_attention.py +11 -2
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/torchao_utils.py +58 -45
- sglang/srt/managers/detokenizer_manager.py +37 -17
- sglang/srt/managers/io_struct.py +39 -10
- sglang/srt/managers/schedule_batch.py +39 -24
- sglang/srt/managers/schedule_policy.py +64 -5
- sglang/srt/managers/scheduler.py +236 -197
- sglang/srt/managers/tokenizer_manager.py +99 -58
- sglang/srt/managers/tp_worker_overlap_thread.py +7 -5
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +2 -2
- sglang/srt/mem_cache/memory_pool.py +5 -1
- sglang/srt/mem_cache/radix_cache.py +12 -2
- sglang/srt/model_executor/cuda_graph_runner.py +39 -11
- sglang/srt/model_executor/model_runner.py +24 -9
- sglang/srt/model_parallel.py +67 -10
- sglang/srt/models/commandr.py +2 -2
- sglang/srt/models/deepseek_v2.py +87 -7
- sglang/srt/models/gemma2.py +34 -0
- sglang/srt/models/gemma2_reward.py +0 -1
- sglang/srt/models/granite.py +517 -0
- sglang/srt/models/grok.py +72 -13
- sglang/srt/models/llama.py +22 -5
- sglang/srt/models/llama_classification.py +11 -23
- sglang/srt/models/llama_reward.py +0 -2
- sglang/srt/models/llava.py +37 -14
- sglang/srt/models/mixtral.py +12 -9
- sglang/srt/models/phi3_small.py +0 -5
- sglang/srt/models/qwen2.py +20 -0
- sglang/srt/models/qwen2_moe.py +0 -5
- sglang/srt/models/torch_native_llama.py +0 -5
- sglang/srt/openai_api/adapter.py +4 -0
- sglang/srt/openai_api/protocol.py +9 -4
- sglang/srt/sampling/sampling_batch_info.py +9 -8
- sglang/srt/server.py +4 -4
- sglang/srt/server_args.py +62 -13
- sglang/srt/utils.py +57 -10
- sglang/test/test_utils.py +3 -2
- sglang/utils.py +10 -3
- sglang/version.py +1 -1
- {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/METADATA +15 -9
- {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/RECORD +72 -65
- {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/LICENSE +0 -0
- {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tokenizer_manager.py

@@ -22,7 +22,7 @@ import signal
 import sys
 import time
 import uuid
-from typing import Dict, List, Optional,
+from typing import Any, Dict, List, Optional, Union

 import fastapi
 import uvloop
@@ -76,6 +76,7 @@ class ReqState:
     out_list: List
     finished: bool
     event: asyncio.Event
+    obj: Any

     # For metrics
     created_time: float
@@ -283,7 +284,7 @@ class TokenizerManager:
     ):
         """Wait for the response of one request."""
         event = asyncio.Event()
-        state = ReqState([], False, event, created_time=created_time)
+        state = ReqState([], False, event, obj, created_time=created_time)
         self.rid_to_state[obj.rid] = state

         while True:
@@ -295,15 +296,7 @@ class TokenizerManager:
                     raise ValueError(f"Abort request {obj.rid}")
                 continue

-
-                out = self.convert_logprob_style(
-                    state.out_list[-1],
-                    obj.return_logprob,
-                    obj.top_logprobs_num,
-                    obj.return_text_in_logprobs,
-                )
-            else:  # isinstance(obj, (EmbeddingReqInput,))
-                out = state.out_list[-1]
+            out = state.out_list[-1]

             state.out_list = []
             if state.finished:
@@ -315,7 +308,13 @@ class TokenizerManager:
                     break

             state.event.clear()
-
+
+            if obj.stream:
+                yield out
+            else:
+                if request is not None and await request.is_disconnected():
+                    self.abort_request(obj.rid)
+                    raise ValueError(f"Abort request {obj.rid}")

     async def _handle_batch_request(
         self,
@@ -573,7 +572,7 @@ class TokenizerManager:

     async def sigterm_watchdog(self):
         while not self.gracefully_exit:
-            await asyncio.sleep(
+            await asyncio.sleep(5)

         # drain requests
         while True:
@@ -609,29 +608,55 @@ class TokenizerManager:
                 if state is None:
                     continue

-
+                meta_info = {
+                    "id": rid,
+                    "finish_reason": recv_obj.finished_reasons[i],
+                    "prompt_tokens": recv_obj.prompt_tokens[i],
+                }
+
+                if getattr(state.obj, "return_logprob", False):
+                    self.convert_logprob_style(
+                        meta_info,
+                        state.obj.top_logprobs_num,
+                        state.obj.return_text_in_logprobs,
+                        recv_obj,
+                        i,
+                    )
+
+                if not isinstance(recv_obj, BatchEmbeddingOut):
+                    meta_info.update(
+                        {
+                            "completion_tokens": recv_obj.completion_tokens[i],
+                            "cached_tokens": recv_obj.cached_tokens[i],
+                        }
+                    )
+
                 if isinstance(recv_obj, BatchStrOut):
                     out_dict = {
                         "text": recv_obj.output_strs[i],
-                        "meta_info":
+                        "meta_info": meta_info,
                     }
                 elif isinstance(recv_obj, BatchTokenIDOut):
                     out_dict = {
                         "token_ids": recv_obj.output_ids[i],
-                        "meta_info":
+                        "meta_info": meta_info,
                     }
                 else:
                     assert isinstance(recv_obj, BatchEmbeddingOut)
                     out_dict = {
                         "embedding": recv_obj.embeddings[i],
-                        "meta_info":
+                        "meta_info": meta_info,
                     }
                 state.out_list.append(out_dict)
-                state.finished = recv_obj.
+                state.finished = recv_obj.finished_reasons[i] is not None
                 state.event.set()

                 if self.enable_metrics:
-                    completion_tokens =
+                    completion_tokens = (
+                        recv_obj.completion_tokens[i]
+                        if recv_obj.completion_tokens
+                        else 0
+                    )

                     if state.first_token_time is None:
                         state.first_token_time = time.time()
@@ -647,7 +672,7 @@ class TokenizerManager:

                     if state.finished:
                         self.metrics_collector.inc_prompt_tokens(
-                            recv_obj.
+                            recv_obj.prompt_tokens[i]
                         )
                         self.metrics_collector.inc_generation_tokens(
                             completion_tokens
@@ -696,57 +721,73 @@ class TokenizerManager:

     def convert_logprob_style(
         self,
-
-        return_logprob: bool,
+        meta_info: dict,
         top_logprobs_num: int,
         return_text_in_logprobs: bool,
+        recv_obj: BatchStrOut,
+        recv_obj_index: int,
     ):
-
-
-
+        meta_info["input_token_logprobs"] = self.detokenize_logprob_tokens(
+            recv_obj.input_token_logprobs_val[recv_obj_index],
+            recv_obj.input_token_logprobs_idx[recv_obj_index],
+            return_text_in_logprobs,
+        )
+        meta_info["output_token_logprobs"] = self.detokenize_logprob_tokens(
+            recv_obj.output_token_logprobs_val[recv_obj_index],
+            recv_obj.output_token_logprobs_idx[recv_obj_index],
+            return_text_in_logprobs,
+        )
+        meta_info["normalized_prompt_logprob"] = recv_obj.normalized_prompt_logprob[
+            recv_obj_index
+        ]
+
+        if top_logprobs_num > 0:
+            meta_info["input_top_logprobs"] = self.detokenize_top_logprobs_tokens(
+                recv_obj.input_top_logprobs_val[recv_obj_index],
+                recv_obj.input_top_logprobs_idx[recv_obj_index],
+                return_text_in_logprobs,
             )
-
-
+            meta_info["output_top_logprobs"] = self.detokenize_top_logprobs_tokens(
+                recv_obj.output_top_logprobs_val[recv_obj_index],
+                recv_obj.output_top_logprobs_idx[recv_obj_index],
+                return_text_in_logprobs,
             )

-        if top_logprobs_num > 0:
-            ret["meta_info"]["input_top_logprobs"] = (
-                self.detokenize_top_logprobs_tokens(
-                    ret["meta_info"]["input_top_logprobs"],
-                    return_text_in_logprobs,
-                )
-            )
-            ret["meta_info"]["output_top_logprobs"] = (
-                self.detokenize_top_logprobs_tokens(
-                    ret["meta_info"]["output_top_logprobs"], return_text_in_logprobs
-                )
-            )
-        return ret
-
     def detokenize_logprob_tokens(
-        self,
+        self,
+        token_logprobs_val: List[float],
+        token_logprobs_idx: List[int],
+        decode_to_text: bool,
     ):
-        # TODO(lianmin): This should run on DetokenizerManager
         if not decode_to_text:
-            return [
-
-
-
-
-
-
-            ]
+            return [
+                (logprob, token_id, None)
+                for logprob, token_id in zip(token_logprobs_val, token_logprobs_idx)
+            ]
+        else:
+            assert self.tokenizer is not None
+            token_texts = self.tokenizer.batch_decode(token_logprobs_idx)
+            return list(zip(token_logprobs_val, token_logprobs_idx, token_texts))

-    def detokenize_top_logprobs_tokens(
+    def detokenize_top_logprobs_tokens(
+        self,
+        token_logprobs_val: List[float],
+        token_logprobs_idx: List[int],
+        decode_to_text: bool,
+    ):
         # TODO: The current implementation only batches the detokenization for top-k tokens per single position.
         # We should batch all top-k tokens in all positions.
-
-
-
-
+        ret = []
+        for i in range(len(token_logprobs_val)):
+            if token_logprobs_val[i]:
+                ret.append(
+                    self.detokenize_logprob_tokens(
+                        token_logprobs_val[i], token_logprobs_idx[i], decode_to_text
+                    )
                 )
-
+            else:
+                ret.append(None)
+        return ret


 class SignalHandler:
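The logprob hunks above move per-request metadata into a `meta_info` dict and have `detokenize_logprob_tokens` return `(logprob, token_id, text)` triples. A minimal sketch of that shape, using made-up values (the commented tokenizer call mirrors the `batch_decode` usage in the hunk):

```python
# Illustrative sketch of the triple format built by the new detokenize_logprob_tokens.
token_logprobs_val = [-0.05, -1.20]   # made-up logprobs
token_logprobs_idx = [15496, 2159]    # made-up token IDs

# decode_to_text=False: the text slot stays None
without_text = [
    (logprob, token_id, None)
    for logprob, token_id in zip(token_logprobs_val, token_logprobs_idx)
]
print(without_text)  # [(-0.05, 15496, None), (-1.2, 2159, None)]

# decode_to_text=True (requires a loaded HF tokenizer, so only sketched here):
# token_texts = tokenizer.batch_decode(token_logprobs_idx)
# with_text = list(zip(token_logprobs_val, token_logprobs_idx, token_texts))
```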
sglang/srt/managers/tp_worker_overlap_thread.py

@@ -32,12 +32,13 @@ from sglang.srt.managers.io_struct import (
 from sglang.srt.managers.schedule_batch import ModelWorkerBatch
 from sglang.srt.managers.tp_worker import TpModelWorker
 from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import get_compiler_backend
 from sglang.utils import get_exception_traceback

 logger = logging.getLogger(__name__)


-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, backend=get_compiler_backend())
 def resolve_future_token_ids(input_ids, future_token_ids_map):
     input_ids[:] = torch.where(
         input_ids < 0,
@@ -73,12 +74,13 @@ class TpModelWorkerClient:
         # Launch threads
         self.input_queue = Queue()
         self.output_queue = Queue()
-        self.forward_stream = torch.
+        self.forward_stream = torch.get_device_module(self.device).Stream()
         self.forward_thread = threading.Thread(
             target=self.forward_thread_func,
         )
         self.forward_thread.start()
         self.parent_process = psutil.Process().parent()
+        self.scheduler_stream = torch.get_device_module(self.device).current_stream()

     def get_worker_info(self):
         return self.worker.get_worker_info()
@@ -97,7 +99,7 @@ class TpModelWorkerClient:

     def forward_thread_func(self):
         try:
-            with torch.
+            with torch.get_device_module(self.device).stream(self.forward_stream):
                 self.forward_thread_func_()
         except Exception:
             traceback = get_exception_traceback()
@@ -122,7 +124,7 @@ class TpModelWorkerClient:

         # Create event
         self.launch_done = threading.Event()
-        copy_done = torch.
+        copy_done = torch.get_device_module(self.device).Event()

         # Resolve future tokens in the input
         input_ids = model_worker_batch.input_ids
@@ -190,7 +192,7 @@ class TpModelWorkerClient:
         )

         # A cuda stream sync here to avoid the cuda illegal memory access error.
-
+        self.scheduler_stream.synchronize()

         # Push a new batch to the queue
         self.input_queue.put((model_worker_batch, self.future_token_ids_ct))
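The stream and event changes above replace hard-coded `torch.cuda` calls with `torch.get_device_module(self.device)`, which resolves to the device-specific module (`torch.cuda` on CUDA devices). A minimal standalone sketch of the pattern, assuming a CUDA build of PyTorch recent enough to provide `torch.get_device_module`:

```python
import torch

# Sketch only: assumes a CUDA device is present.
assert torch.cuda.is_available()
device = "cuda"

dev = torch.get_device_module(device)   # resolves to torch.cuda here
forward_stream = dev.Stream()           # instead of torch.cuda.Stream()
copy_done = dev.Event()                 # instead of torch.cuda.Event()

with dev.stream(forward_stream):        # instead of torch.cuda.stream(...)
    x = torch.ones(8, device=device) * 2
    copy_done.record(forward_stream)

copy_done.synchronize()
print(x.sum().item())  # 16.0
```

The same indirection is what keeps `self.scheduler_stream.synchronize()` in the last hunk device-agnostic.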
sglang/srt/mem_cache/base_prefix_cache.py

@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Callable
+from typing import Callable, List, Tuple


 class BasePrefixCache(ABC):
@@ -10,7 +10,7 @@ class BasePrefixCache(ABC):
         pass

     @abstractmethod
-    def match_prefix(self, **kwargs):
+    def match_prefix(self, **kwargs) -> Tuple[List[int], int]:
         pass

     @abstractmethod
sglang/srt/mem_cache/chunk_cache.py

@@ -2,7 +2,7 @@ from __future__ import annotations

 """Cache for chunked prefill, used when RadixCache is disabled."""

-from typing import TYPE_CHECKING, Callable, List, Optional
+from typing import TYPE_CHECKING, Callable, List, Optional, Tuple

 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
@@ -30,7 +30,7 @@ class ChunkCache(BasePrefixCache):
     def reset(self):
         self.entries = {}

-    def match_prefix(self, rid: int, key: List[int]):
+    def match_prefix(self, rid: int, key: List[int]) -> Tuple[List[int], int]:
         if rid not in self.entries:
             return [], None

sglang/srt/mem_cache/memory_pool.py

@@ -27,6 +27,7 @@ from typing import List, Tuple, Union
 import torch

 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.utils import get_compiler_backend

 logger = logging.getLogger(__name__)

@@ -129,6 +130,9 @@ class BaseTokenToKVPool:
         return select_index.to(self.device, non_blocking=True)

     def free(self, free_index: torch.Tensor):
+        if free_index.numel() == 0:
+            return
+
         if self.is_not_in_free_group:
             self.free_slots = torch.concat((self.free_slots, free_index.cpu()))
         else:
@@ -234,7 +238,7 @@ class MHATokenToKVPool(BaseTokenToKVPool):

 # This compiled version is slower in the unit test
 # python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, backend=get_compiler_backend())
 def copy_two_array(loc, dst_1, src_1, dst_2, src_2, dtype, store_dtype):
     dst_1[loc] = src_1.to(dtype).view(store_dtype)
     dst_2[loc] = src_2.to(dtype).view(store_dtype)
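The `free()` hunk above adds an early return for empty index tensors before the existing `torch.concat` call. A tiny self-contained sketch of that guard (the function and variable names below are illustrative, not the pool's real structure):

```python
import torch

# Sketch of the guard added to BaseTokenToKVPool.free: an empty index tensor is now a no-op.
def free(free_slots: torch.Tensor, free_index: torch.Tensor) -> torch.Tensor:
    if free_index.numel() == 0:  # the new early return
        return free_slots
    return torch.concat((free_slots, free_index.cpu()))

slots = torch.tensor([], dtype=torch.int64)
slots = free(slots, torch.tensor([], dtype=torch.int64))  # unchanged
slots = free(slots, torch.tensor([3, 7]))
print(slots.tolist())  # [3, 7]
```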
sglang/srt/mem_cache/radix_cache.py

@@ -22,7 +22,7 @@ The radix tree data structure for managing the KV cache.
 import heapq
 import time
 from collections import defaultdict
-from typing import TYPE_CHECKING, Callable, List, Optional
+from typing import TYPE_CHECKING, Callable, List, Optional, Tuple

 import torch

@@ -76,7 +76,17 @@ class RadixCache(BasePrefixCache):
         self.root_node.lock_ref = 1
         self.evictable_size_ = 0

-    def match_prefix(self, key: List, **kwargs):
+    def match_prefix(self, key: List[int], **kwargs) -> Tuple[torch.Tensor, int]:
+        """Find the matching prefix from the radix tree.
+        Args:
+            key: A list of token IDs to find a matching prefix.
+        Returns:
+            A tuple of a tensor of matching prefix token IDs and
+            the last node that contains the prefix values. Note that
+            this API can modify the internal state of the Radix tree.
+            The last node create a new child if the prefix is shorter
+            than the last node's value.
+        """
         if self.disable:
             return [], self.root_node

sglang/srt/model_executor/cuda_graph_runner.py

@@ -20,6 +20,8 @@ from contextlib import contextmanager
 from typing import TYPE_CHECKING, Callable

 import torch
+import tqdm
+from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.distributed.parallel_state import graph_capture
 from vllm.model_executor.custom_op import CustomOp

@@ -47,7 +49,7 @@ def _to_torch(model: torch.nn.Module, reverse: bool, batch_size: int):
         if "FusedMoE" in sub.__class__.__name__:
             if batch_size == 1:
                 # The performance of torch.compile on this layer is not always good when bs > 1,
-                # so we decide to
+                # so we decide to only use torch.compile when bs =1
                 sub._forward_method = fused_moe_forward_native
             else:
                 sub._forward_method = sub.forward_native
@@ -127,9 +129,23 @@ class CudaGraphRunner:

         # Batch sizes to capture
         if model_runner.server_args.disable_cuda_graph_padding:
-            self.capture_bs = list(range(1,
+            self.capture_bs = list(range(1, 33)) + [64, 128]
         else:
             self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
+
+        if max(self.capture_bs) > model_runner.req_to_token_pool.size:
+            # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
+            # is very samll. We add more values here to make sure we capture the maximum bs.
+            self.capture_bs = list(
+                sorted(
+                    set(
+                        self.capture_bs
+                        + [model_runner.req_to_token_pool.size - 1]
+                        + [model_runner.req_to_token_pool.size]
+                    )
+                )
+            )
+
         self.capture_bs = [
             bs
             for bs in self.capture_bs
@@ -241,7 +257,12 @@ class CudaGraphRunner:
     def capture(self):
         with graph_capture() as graph_capture_context:
             self.stream = graph_capture_context.stream
-
+            capture_bs = (
+                tqdm.tqdm(self.capture_bs)
+                if get_tensor_model_parallel_rank() == 0
+                else self.capture_bs
+            )
+            for bs in capture_bs:
                 with patch_model(
                     self.model_runner.model,
                     bs in self.compile_bs,
@@ -373,8 +394,14 @@ class CudaGraphRunner:

         # Extract logprobs
         if forward_batch.return_logprob:
-
-
+            logits_metadata = LogitsMetadata(
+                forward_mode=ForwardMode.DECODE,
+                top_logprobs_nums=forward_batch.top_logprobs_nums,
+            )
+            next_token_logprobs = (
+                LogitsProcessor.compute_temp_top_p_normalized_logprobs(
+                    next_token_logits, logits_metadata
+                )
             )
             logits_output = LogitsProcessorOutput(
                 next_token_logits=next_token_logits,
@@ -382,13 +409,14 @@ class CudaGraphRunner:
             )
             return_top_logprob = any(x > 0 for x in forward_batch.top_logprobs_nums)
             if return_top_logprob:
-
-
-
-                )
-                logits_output.output_top_logprobs = LogitsProcessor.get_top_logprobs(
+                (
+                    logits_output.output_top_logprobs_val,
+                    logits_output.output_top_logprobs_idx,
+                ) = LogitsProcessor.get_top_logprobs(
                     next_token_logprobs, logits_metadata
-                )[
+                )[
+                    2:4
+                ]
         else:
             logits_output = LogitsProcessorOutput(
                 next_token_logits=next_token_logits,
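One of the hunks above pads `capture_bs` so that the request pool size itself is always among the captured batch sizes. A standalone sketch of that adjustment with made-up numbers; `req_to_token_pool_size` stands in for `model_runner.req_to_token_pool.size`, and the final filter condition is assumed because the hunk is cut off at that point:

```python
# Default capture sizes from the hunk above.
capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
req_to_token_pool_size = 12  # e.g. a small --max-running-requests setting

if max(capture_bs) > req_to_token_pool_size:
    # Ensure the largest runnable batch size (and size - 1) are captured.
    capture_bs = sorted(
        set(capture_bs + [req_to_token_pool_size - 1, req_to_token_pool_size])
    )

# Assumed filter: keep only sizes that fit in the request pool.
capture_bs = [bs for bs in capture_bs if bs <= req_to_token_pool_size]
print(capture_bs)  # [1, 2, 4, 8, 11, 12]
```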
sglang/srt/model_executor/model_runner.py

@@ -27,7 +27,6 @@ from vllm.distributed import (
     initialize_model_parallel,
     set_custom_all_reduce,
 )
-from vllm.distributed.parallel_state import in_the_same_node_as

 from sglang.srt.configs.device_config import DeviceConfig
 from sglang.srt.configs.load_config import LoadConfig
@@ -38,6 +37,7 @@ from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBack
 from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.sampler import Sampler
+from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
 from sglang.srt.lora.lora_manager import LoRAManager
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.mem_cache.memory_pool import (
@@ -111,15 +111,20 @@ class ModelRunner:
         )

         if self.is_multimodal:
-            logger.info(
-                "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
-            )
-            server_args.chunked_prefill_size = -1
             self.mem_fraction_static *= 0.95
+            if self.model_config.hf_config.architectures == [
+                "MllamaForConditionalGeneration"
+            ]:
+                logger.info("Automatically turn off --chunked-prefill-size for mllama.")
+                server_args.chunked_prefill_size = -1
             # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
             if self.model_config.hf_config.architectures == [
                 "Qwen2VLForConditionalGeneration"
             ]:
+                logger.info(
+                    "Automatically turn off --chunked-prefill-size and disable radix cache for qwen2-vl."
+                )
+                server_args.chunked_prefill_size = -1
                 server_args.disable_radix_cache = True

         # Global vars
@@ -139,6 +144,7 @@ class ModelRunner:
                 "torchao_config": server_args.torchao_config,
                 "enable_nan_detection": server_args.enable_nan_detection,
                 "enable_dp_attention": server_args.enable_dp_attention,
+                "enable_ep_moe": server_args.enable_ep_moe,
             }
         )

@@ -151,6 +157,11 @@ class ModelRunner:
         self.sampler = Sampler()
         self.load_model()

+        # Apply torchao quantization
+        apply_torchao_config_to_model(
+            self.model, global_server_args_dict["torchao_config"]
+        )
+
         # Apply torch TP if the model supports it
         supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
         if self.tp_size > 1 and supports_torch_tp:
@@ -235,20 +246,22 @@ class ModelRunner:
         if torch.cuda.get_device_capability()[1] < 5:
             raise RuntimeError("SGLang only supports sm75 and above.")

-        # Prepare the
+        # Prepare the model config
         self.load_config = LoadConfig(
             load_format=self.server_args.load_format,
             download_dir=self.server_args.download_dir,
         )
-
         if self.server_args.load_format == "gguf":
             monkey_patch_vllm_gguf_config()
+
+        # Load the model
         self.model = get_model(
             model_config=self.model_config,
             load_config=self.load_config,
             device_config=DeviceConfig(self.device),
         )
+
+        # Parse other args
         self.sliding_window_size = (
             self.model.get_attention_sliding_window_size()
             if hasattr(self.model, "get_attention_sliding_window_size")
@@ -263,8 +276,10 @@ class ModelRunner:
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )

-    def update_weights_from_disk(
-
+    def update_weights_from_disk(
+        self, model_path: str, load_format: str
+    ) -> tuple[bool, str]:
+        """Update engine weights in-place from the disk."""
         from sglang.srt.model_loader.loader import (
             DefaultModelLoader,
             device_loading_context,